From d732ac1e2cf7a845867cf3473e6a52cdd8caca85 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Tue, 3 Dec 2024 13:39:36 -0500 Subject: [PATCH 01/14] add summary-statistics-filepath --- src/fairscape_cli/models/dataset.py | 110 +++++++++++++++++++++++---- src/fairscape_cli/rocrate/rocrate.py | 48 +++++++++--- 2 files changed, 133 insertions(+), 25 deletions(-) diff --git a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index 1ca3a8f..824548e 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -2,6 +2,8 @@ FairscapeBaseModel, Identifier ) + +from fairscape_cli.models.computation import GenerateComputation, Computation from fairscape_cli.config import ( NAAN ) @@ -22,6 +24,7 @@ BaseModel, constr, Field, + Tuple, AnyUrl, field_serializer ) @@ -44,6 +47,7 @@ class Dataset(FairscapeBaseModel): derivedFrom: Optional[List[str]] = Field(default=[]) usedBy: Optional[List[str]] = Field(default=[]) contentUrl: Optional[str] = Field(default=None) + hasSummaryStatistics: Optional[str] = Field(default=None) #@field_serializer('datePublished') #def serialize_date_published(self, datePublished: datetime): @@ -68,11 +72,13 @@ def GenerateDataset( usedBy: Optional[List[str]], generatedBy: Optional[List[str]], filepath: Optional[str], - cratePath + cratePath, + summary_stats_guid: Optional[str] = None ): - sq = GenerateDatetimeSquid() - guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}" + if not guid: + sq = GenerateDatetimeSquid() + guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}" datasetMetadata = { "@id": guid, @@ -88,22 +94,14 @@ def GenerateDataset( "additionalDocumentation": additionalDocumentation, "format": dataFormat, "schema": schema, - # sanitize input lists of newline breaks - "derivedFrom": [ - derived.strip("\n") for derived in derivedFrom - ], - "usedBy": [ - used.strip("\n") for used in usedBy - ], - "generatedBy": [ - gen.strip("\n") for gen in generatedBy - ] + "derivedFrom": [derived.strip("\n") for derived in derivedFrom], + "usedBy": [used.strip("\n") for used in usedBy], + "generatedBy": [gen.strip("\n") for gen in generatedBy], + "hasSummaryStatistics": summary_stats_guid } datasetMetadata['contentURL'] = setRelativeFilepath(cratePath, filepath) - datasetInstance = Dataset.model_validate(datasetMetadata) - return datasetInstance @@ -136,4 +134,84 @@ def setRelativeFilepath(cratePath, filePath): # if relative filepath datasetPath = pathlib.Path(filePath).absolute() relativePath = datasetPath.relative_to(rocratePath) - return f"file:///{str(relativePath)}" \ No newline at end of file + return f"file:///{str(relativePath)}" + + +def generate_summary_stats_elements( + name: str, + author: str, + keywords: List[str], + date_published: str, + version: str, + associated_publication: Optional[str], + additional_documentation: Optional[str], + schema: Optional[str], + dataset_guid: str, + summary_statistics_filepath: str, + crate_path: pathlib.Path +) -> Tuple[str, Dataset, Computation]: + """Generate summary statistics dataset and computation elements + + Args: + name: Name of the main dataset + author: Author of the dataset + keywords: Dataset keywords + date_published: Publication date + version: Dataset version + associated_publication: Optional associated publication + additional_documentation: Optional additional documentation + schema: Optional schema + dataset_guid: GUID of the main dataset + summary_statistics_filepath: Path to summary statistics file + crate_path: Path to RO-Crate + + 
Returns: + Tuple containing: + - Summary statistics GUID + - Summary statistics Dataset instance + - Computation instance that generated the summary statistics + """ + # Generate GUIDs + sq_stats = GenerateDatetimeSquid() + summary_stats_guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-stats-{sq_stats}" + + sq_comp = GenerateDatetimeSquid() + computation_guid = f"ark:{NAAN}/computation-{name.lower().replace(' ', '-')}-stats-{sq_comp}" + + # Create computation instance + computation_instance = GenerateComputation( + guid=computation_guid, + name=f"Summary Statistics Computation for {name}", + runBy=author, + command="", + dateCreated=date_published, + description=f"Computation that generated summary statistics for dataset: {name}", + keywords=keywords, + usedSoftware=[], + usedDataset=[dataset_guid], + generated=[summary_stats_guid] + ) + + # Create summary statistics dataset + summary_stats_instance = GenerateDataset( + guid=summary_stats_guid, + url=None, + author=author, + name=f"{name} - Summary Statistics", + description=f"Summary statistics for dataset: {name}", + keywords=keywords, + datePublished=date_published, + version=version, + associatedPublication=associated_publication, + additionalDocumentation=additional_documentation, + dataFormat='pdf', + schema=schema, + derivedFrom=[], + generatedBy=[computation_guid], + usedBy=[], + filepath=summary_statistics_filepath, + cratePath=crate_path, + summary_stats_guid=None + ) + + return summary_stats_guid, summary_stats_instance, computation_instance \ No newline at end of file diff --git a/src/fairscape_cli/rocrate/rocrate.py b/src/fairscape_cli/rocrate/rocrate.py index 2b0438b..779a2b8 100644 --- a/src/fairscape_cli/rocrate/rocrate.py +++ b/src/fairscape_cli/rocrate/rocrate.py @@ -5,9 +5,12 @@ from pydantic import ValidationError from datetime import datetime - +from fairscape_cli.config import ( + NAAN +) from fairscape_cli.models.utils import ( - FileNotInCrateException + FileNotInCrateException, + GenerateDatetimeSquid ) from fairscape_cli.models import ( Dataset, @@ -204,6 +207,7 @@ def registerSoftware( @click.option('--keywords', required=True, multiple=True) @click.option('--data-format', required=True) @click.option('--filepath', required=True) +@click.option('--summary-statistics-filepath', required=False, type=click.Path(exists=True)) @click.option('--used-by', required=False, multiple=True) @click.option('--derived-from', required=False, multiple=True) @click.option('--generated-by', required=False, multiple=True) @@ -224,6 +228,7 @@ def registerDataset( keywords: List[str], data_format: str, filepath: str, + summary_statistics_filepath: Optional[str], used_by: Optional[List[str]], derived_from: Optional[List[str]], generated_by: Optional[List[str]], @@ -231,8 +236,7 @@ def registerDataset( associated_publication: Optional[str], additional_documentation: Optional[List[str]], ): - """Register Dataset object metadata with the specified RO-Crate - """ + """Register Dataset object metadata with the specified RO-Crate""" try: crate_instance = ReadROCrateMetadata(rocrate_path) except Exception as exc: @@ -240,8 +244,33 @@ def registerDataset( ctx.exit(code=1) try: + # Generate main dataset GUID + sq_dataset = GenerateDatetimeSquid() + dataset_guid = guid if guid else f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq_dataset}" + + summary_stats_guid = None + elements = [] + + # Handle summary statistics if provided + if summary_statistics_filepath: + summary_stats_guid, summary_stats_instance, 
computation_instance = generate_summary_stats_elements( + name=name, + author=author, + keywords=keywords, + date_published=date_published, + version=version, + associated_publication=associated_publication, + additional_documentation=additional_documentation, + schema=schema, + dataset_guid=dataset_guid, + summary_statistics_filepath=summary_statistics_filepath, + crate_path=rocrate_path + ) + elements.extend([computation_instance, summary_stats_instance]) + + # Generate main dataset dataset_instance = GenerateDataset( - guid=guid, + guid=dataset_guid, url=url, author=author, name=name, @@ -257,9 +286,12 @@ def registerDataset( generatedBy=generated_by, usedBy=used_by, filepath=filepath, - cratePath=rocrate_path + cratePath=rocrate_path, + summary_stats_guid=summary_stats_guid ) - AppendCrate(cratePath = rocrate_path, elements=[dataset_instance]) + + elements.insert(0, dataset_instance) + AppendCrate(cratePath=rocrate_path, elements=elements) click.echo(dataset_instance.guid) except FileNotInCrateException as e: @@ -275,8 +307,6 @@ def registerDataset( click.echo(f"ERROR: {str(exc)}") ctx.exit(code=1) - - @register.command('computation') @click.argument('rocrate-path', type=click.Path(exists=True, path_type=pathlib.Path)) From 772ac103eb389f528643cc6e15208c22275f058e Mon Sep 17 00:00:00 2001 From: jniestroy Date: Tue, 3 Dec 2024 13:43:00 -0500 Subject: [PATCH 02/14] also add summary stats to add --- src/fairscape_cli/models/dataset.py | 2 +- src/fairscape_cli/rocrate/rocrate.py | 55 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index 824548e..75590fc 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -137,7 +137,7 @@ def setRelativeFilepath(cratePath, filePath): return f"file:///{str(relativePath)}" -def generate_summary_stats_elements( +def generateSummaryStatsElements( name: str, author: str, keywords: List[str], diff --git a/src/fairscape_cli/rocrate/rocrate.py b/src/fairscape_cli/rocrate/rocrate.py index 779a2b8..7893a81 100644 --- a/src/fairscape_cli/rocrate/rocrate.py +++ b/src/fairscape_cli/rocrate/rocrate.py @@ -24,7 +24,8 @@ ReadROCrateMetadata, AppendCrate, CopyToROCrate, - BagIt + BagIt, + generateSummaryStatsElements ) from typing import ( @@ -253,7 +254,7 @@ def registerDataset( # Handle summary statistics if provided if summary_statistics_filepath: - summary_stats_guid, summary_stats_instance, computation_instance = generate_summary_stats_elements( + summary_stats_guid, summary_stats_instance, computation_instance = generateSummaryStatsElements( name=name, author=author, keywords=keywords, @@ -464,6 +465,8 @@ def software( @click.option('--data-format', required=True) @click.option('--source-filepath', required=True) @click.option('--destination-filepath', required=True) +@click.option('--summary-statistics-source', required=False, type=click.Path(exists=True)) +@click.option('--summary-statistics-destination', required=False, type=click.Path()) @click.option('--used-by', required=False, multiple=True) @click.option('--derived-from', required=False, multiple=True) @click.option('--generated-by', required=False, multiple=True) @@ -485,6 +488,8 @@ def dataset( data_format, source_filepath, destination_filepath, + summary_statistics_source, + summary_statistics_destination, used_by, derived_from, generated_by, @@ -492,9 +497,7 @@ def dataset( associated_publication, additional_documentation, ): - """Add a Dataset 
file and its metadata to the RO-Crate. - """ - + """Add a Dataset file and its metadata to the RO-Crate.""" try: crateInstance = ReadROCrateMetadata(rocrate_path) except Exception as exc: @@ -502,9 +505,40 @@ def dataset( ctx.exit(code=1) try: + # Copy main dataset file CopyToROCrate(source_filepath, destination_filepath) + + # Generate main dataset GUID + sq_dataset = GenerateDatetimeSquid() + dataset_guid = guid if guid else f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq_dataset}" + + summary_stats_guid = None + elements = [] + + # Handle summary statistics if provided + if summary_statistics_source and summary_statistics_destination: + # Copy summary statistics file + CopyToROCrate(summary_statistics_source, summary_statistics_destination) + + # Generate summary statistics elements + summary_stats_guid, summary_stats_instance, computation_instance = generateSummaryStatsElements( + name=name, + author=author, + keywords=keywords, + date_published=date_published, + version=version, + associated_publication=associated_publication, + additional_documentation=additional_documentation, + schema=schema, + dataset_guid=dataset_guid, + summary_statistics_filepath=summary_statistics_destination, + crate_path=rocrate_path + ) + elements.extend([computation_instance, summary_stats_instance]) + + # Generate main dataset dataset_instance = GenerateDataset( - guid=guid, + guid=dataset_guid, url=url, author=author, name=name, @@ -520,9 +554,12 @@ def dataset( generatedBy=generated_by, usedBy=used_by, filepath=destination_filepath, - cratePath=rocrate_path + cratePath=rocrate_path, + summary_stats_guid=summary_stats_guid ) - AppendCrate(cratePath = rocrate_path, elements=[dataset_instance]) + + elements.insert(0, dataset_instance) + AppendCrate(cratePath=rocrate_path, elements=elements) click.echo(dataset_instance.guid) except ValidationError as e: @@ -533,5 +570,3 @@ def dataset( except Exception as exc: click.echo(f"ERROR: {str(exc)}") ctx.exit(code=1) - - # TODO add to cache From 42f414d3904363150696f991344bcd4a85f48971 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 4 Dec 2024 13:09:11 -0500 Subject: [PATCH 03/14] lots of changes... 
imports plus execution code --- src/fairscape_cli/models/__init__.py | 10 +- src/fairscape_cli/models/computation.py | 21 ++-- src/fairscape_cli/models/dataset.py | 69 +++++++----- src/fairscape_cli/models/guid_utils.py | 31 ++++++ src/fairscape_cli/models/rocrate.py | 61 +++++++---- src/fairscape_cli/models/schema/tabular.py | 2 +- src/fairscape_cli/models/software.py | 21 +--- src/fairscape_cli/models/utils.py | 81 +++++--------- src/fairscape_cli/rocrate/rocrate.py | 122 ++++++++++++++++++--- 9 files changed, 268 insertions(+), 150 deletions(-) create mode 100644 src/fairscape_cli/models/guid_utils.py diff --git a/src/fairscape_cli/models/__init__.py b/src/fairscape_cli/models/__init__.py index bdc45ab..41a2ef8 100644 --- a/src/fairscape_cli/models/__init__.py +++ b/src/fairscape_cli/models/__init__.py @@ -1,6 +1,8 @@ from fairscape_cli.models.dataset import ( Dataset, - GenerateDataset + GenerateDataset, + generateSummaryStatsElements, + registerOutputs ) from fairscape_cli.models.software import Software, GenerateSoftware from fairscape_cli.models.computation import Computation, GenerateComputation @@ -9,13 +11,16 @@ GenerateROCrate, ReadROCrateMetadata, AppendCrate, - CopyToROCrate + CopyToROCrate, + UpdateCrate ) from fairscape_cli.models.bagit import BagIt __all__ = [ 'Dataset', 'GenerateDataset', + 'generateSummaryStatsElements', + 'registerOutputs', 'Software', 'GenerateSoftware', 'Computation', @@ -25,5 +30,6 @@ 'ReadROCrateMetadata', 'AppendCrate', 'CopyToROCrate', + 'UpdateCrate', 'BagIt' ] diff --git a/src/fairscape_cli/models/computation.py b/src/fairscape_cli/models/computation.py index fa3f7f3..a4d8a3d 100644 --- a/src/fairscape_cli/models/computation.py +++ b/src/fairscape_cli/models/computation.py @@ -1,19 +1,12 @@ -from fairscape_cli.models.base import FairscapeBaseModel -from fairscape_cli.models.utils import GenerateDatetimeSquid -from fairscape_cli.config import NAAN - -from typing import ( - Optional, - List, - Union, - Dict, -) -from pydantic import ( - Field, - AnyUrl -) import re from datetime import datetime +from typing import Optional, List, Union, Dict + +from pydantic import Field, AnyUrl + +from fairscape_cli.config import NAAN +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class Computation(FairscapeBaseModel): diff --git a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index 75590fc..bf46cb2 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -1,34 +1,19 @@ -from fairscape_cli.models.base import ( - FairscapeBaseModel, - Identifier -) - -from fairscape_cli.models.computation import GenerateComputation, Computation -from fairscape_cli.config import ( - NAAN -) -from fairscape_cli.models.utils import GenerateDatetimeSquid, FileNotInCrateException -from fairscape_cli.models.schema.tabular import ( - TabularValidationSchema -) - +# Standard library imports import pathlib -from typing import ( - Optional, - List, - Union, - Dict -) +from datetime import datetime +from typing import Optional, List, Union, Dict, Tuple, Set from pydantic import ( BaseModel, constr, Field, - Tuple, AnyUrl, field_serializer ) -from datetime import datetime + +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid +from fairscape_cli.config import NAAN class Dataset(FairscapeBaseModel): @@ -47,7 +32,7 @@ class Dataset(FairscapeBaseModel): derivedFrom: Optional[List[str]] 
= Field(default=[]) usedBy: Optional[List[str]] = Field(default=[]) contentUrl: Optional[str] = Field(default=None) - hasSummaryStatistics: Optional[str] = Field(default=None) + hasSummaryStatistics: Optional[Union[str, List[str]]] = Field(default=None) #@field_serializer('datePublished') #def serialize_date_published(self, datePublished: datetime): @@ -97,10 +82,11 @@ def GenerateDataset( "derivedFrom": [derived.strip("\n") for derived in derivedFrom], "usedBy": [used.strip("\n") for used in usedBy], "generatedBy": [gen.strip("\n") for gen in generatedBy], + "contentU" "hasSummaryStatistics": summary_stats_guid } - datasetMetadata['contentURL'] = setRelativeFilepath(cratePath, filepath) + datasetMetadata['contentUrl'] = setRelativeFilepath(cratePath, filepath) datasetInstance = Dataset.model_validate(datasetMetadata) return datasetInstance @@ -137,6 +123,7 @@ def setRelativeFilepath(cratePath, filePath): return f"file:///{str(relativePath)}" +from fairscape_cli.models.computation import GenerateComputation, Computation def generateSummaryStatsElements( name: str, author: str, @@ -214,4 +201,36 @@ def generateSummaryStatsElements( summary_stats_guid=None ) - return summary_stats_guid, summary_stats_instance, computation_instance \ No newline at end of file + return summary_stats_guid, summary_stats_instance, computation_instance + +def registerOutputs( + new_files: Set[pathlib.Path], + computation_id: str, + dataset_id: str, + author: str +) -> List[Dict]: + """Register all outputs as datasets""" + output_instances = [] + for file_path in new_files: + file_path_str = str(file_path) + output_instance = GenerateDataset( + guid=None, + name=f"Statistics Output - {file_path.name}", + author=author, # Use the original author + description=f"Statistical analysis output for {dataset_id}", + keywords=["statistics"], + datePublished=datetime.now().isoformat(), + version="1.0", + dataFormat=file_path.suffix[1:], + filepath=file_path_str, + cratePath=str(file_path.parent), + url=None, + associatedPublication=None, + additionalDocumentation=None, + schema=None, + derivedFrom=[], + usedBy=[], + generatedBy=[computation_id] + ) + output_instances.append(output_instance) + return output_instances \ No newline at end of file diff --git a/src/fairscape_cli/models/guid_utils.py b/src/fairscape_cli/models/guid_utils.py new file mode 100644 index 0000000..a85988f --- /dev/null +++ b/src/fairscape_cli/models/guid_utils.py @@ -0,0 +1,31 @@ +from sqids import Sqids +import random +import datetime + +from typing import Set, Dict, List, Optional, Tuple + +from fairscape_cli.config import NAAN + +squids = Sqids(min_length=6) + +def GenerateDatetimeSquid(): + try: + timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) + sq = squids.encode([timestamp_int, random.randint(0, 10000)]) + except: + timestamp_int = int(datetime.datetime.utcnow().timestamp()) + sq = squids.encode([timestamp_int]) + return sq + +def GenerateDatetimeGUID(prefix: str)->str: + try: + timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) + sq = squids.encode([timestamp_int]) + except: + timestamp_int = int(datetime.datetime.utcnow().timestamp()) + sq = squids.encode([timestamp_int]) + return f"ark:{NAAN}/{prefix}-{sq}" + +def GenerateGUID(data: List[int], prefix: str)-> str: + squid_encoded = squids.encode(data) + return f"ark:{NAAN}/{prefix}-{squid_encoded}" \ No newline at end of file diff --git a/src/fairscape_cli/models/rocrate.py b/src/fairscape_cli/models/rocrate.py index 91c8ade..275c8b1 100644 --- 
a/src/fairscape_cli/models/rocrate.py +++ b/src/fairscape_cli/models/rocrate.py @@ -1,30 +1,16 @@ -from fairscape_cli.models import ( - Software, - Dataset, - Computation -) -from fairscape_cli.models.utils import GenerateDatetimeSquid -from fairscape_cli.config import ( - DEFAULT_CONTEXT, - NAAN -) - import pathlib import shutil import json +from typing import Optional, Union, List, Literal, Dict + from prettytable import PrettyTable -from pydantic import ( - BaseModel, - computed_field, - Field, -) -from typing import ( - Optional, - Union, - List, - Literal, - Dict -) +from pydantic import BaseModel, computed_field, Field + +from fairscape_cli.config import NAAN, DEFAULT_CONTEXT +from fairscape_cli.models.software import Software +from fairscape_cli.models.dataset import Dataset +from fairscape_cli.models.computation import Computation +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class ROCrateMetadata(BaseModel): guid: Optional[str] = Field(alias="@id", default=None) @@ -321,3 +307,32 @@ def CopyToROCrate(source_filepath: str, destination_filepath: str): # copy the file into the destinationPath shutil.copy(source_path, destination_path) +def UpdateCrate( + cratePath: pathlib.Path, + element: Union[Dataset, Software, Computation] +): + """Update an existing element in the RO-Crate metadata by matching @id + + Args: + cratePath: Path to the RO-Crate directory or metadata file + element: Updated element to replace existing one with matching @id + """ + if cratePath.is_dir(): + cratePath = cratePath / 'ro-crate-metadata.json' + + with cratePath.open("r+") as rocrate_metadata_file: + rocrate_metadata = json.load(rocrate_metadata_file) + + # Find and replace the element with matching @id + for i, existing in enumerate(rocrate_metadata['@graph']): + if existing.get('@id') == element.guid: + rocrate_metadata['@graph'][i] = element.model_dump( + by_alias=True, + exclude_none=True + ) + break + + # Write back the updated metadata + rocrate_metadata_file.seek(0) + rocrate_metadata_file.truncate() + json.dump(rocrate_metadata, rocrate_metadata_file, indent=2) \ No newline at end of file diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index 387444a..205c372 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -32,7 +32,7 @@ map_arrow_type_to_json_schema ) -from fairscape_cli.models.utils import ( +from fairscape_cli.models.guid_utils import ( GenerateDatetimeSquid ) diff --git a/src/fairscape_cli/models/software.py b/src/fairscape_cli/models/software.py index fb60242..83ebda0 100644 --- a/src/fairscape_cli/models/software.py +++ b/src/fairscape_cli/models/software.py @@ -1,21 +1,12 @@ -from fairscape_cli.models.base import FairscapeBaseModel -from fairscape_cli.models.utils import GenerateDatetimeSquid, FileNotInCrateException -from fairscape_cli.config import NAAN import pathlib - -from pydantic import ( - Field, - AnyUrl, - ConfigDict -) from datetime import datetime -from typing import ( - Optional, - Union, - Dict, - List -) +from typing import Optional, Union, Dict, List +from pydantic import Field, AnyUrl, ConfigDict + +from fairscape_cli.config import NAAN +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class Software(FairscapeBaseModel): diff --git a/src/fairscape_cli/models/utils.py b/src/fairscape_cli/models/utils.py index b51c6e9..dfe270a 100644 --- a/src/fairscape_cli/models/utils.py 
+++ b/src/fairscape_cli/models/utils.py @@ -1,70 +1,45 @@ -# Python Interface for Registering Unique GUIDS -from sqids import Sqids -from pydantic import ( - ValidationError -) -from typing import ( - List - ) -import datetime -from fairscape_cli.config import ( - NAAN - ) -import random +from pathlib import Path +from typing import Set, Dict, List, Optional, Tuple +import subprocess -squids = Sqids(min_length=6) - -def GenerateDatetimeSquid(): - try: - timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) - sq = squids.encode([timestamp_int, random.randint(0, 10000)]) - except: - timestamp_int = int(datetime.datetime.utcnow().timestamp()) - sq = squids.encode([timestamp_int]) - - return sq - - -def GenerateDatetimeGUID(prefix: str)->str: - try: - timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) - sq = squids.encode([timestamp_int]) - except: - timestamp_int = int(datetime.datetime.utcnow().timestamp()) - sq = squids.encode([timestamp_int]) - - return f"ark:{NAAN}/{prefix}-{sq}" - -def GenerateGUID(data: List[int], prefix: str)-> str: - squid_encoded = squids.encode(data) - return f"ark:{NAAN}/{prefix}-{squid_encoded}" +from pydantic import ValidationError +from fairscape_cli.models.base import FairscapeBaseModel def InstantiateModel(ctx, metadata: dict, modelInstance): try: modelInstance.model_validate(metadata) return modelInstance - except ValidationError as metadataError: print('ERROR: MetadataValidationError', end='') for validationFailure in metadataError.errors(): print(f'loc: {validationFailure.loc}\tinput: {validationFailure.input}\tmsg: {validationFailure.msg}', end='') ctx.exit(code=1) - - -def ValidateGUID(ctx, param, value): - """ Make sure a GUID reference is reachable return JSON Metadata - """ - # validate fairscape ARK - - # validate DOI - - # validate url - pass - - class FileNotInCrateException(Exception): def __init__(self, cratePath, filePath): self.message = f"Error: FileNotFound inside ro crate\ncratePath: {str(cratePath)}\tfilePath{str(filePath)}" super().__init__(self.message) + +def getDirectoryContents(directory: Path) -> Set[Path]: + """Get set of all files in directory recursively""" + return set(p for p in directory.rglob('*') if p.is_file()) + +def run_command(command: str) -> Tuple[bool, str, str]: + """Execute command and return success status with output""" + try: + result = subprocess.run( + command.split(), + capture_output=True, + text=True + ) + return result.returncode == 0, result.stdout, result.stderr + except Exception as e: + return False, "", str(e) + +def getEntityFromCrate(crate_instance, entity_id: str) -> Optional[FairscapeBaseModel]: + """Get entity from crate by ID""" + for entity in crate_instance.metadataGraph: + if entity.guid == entity_id: + return entity.dict() + return None \ No newline at end of file diff --git a/src/fairscape_cli/rocrate/rocrate.py b/src/fairscape_cli/rocrate/rocrate.py index 7893a81..7fb2c22 100644 --- a/src/fairscape_cli/rocrate/rocrate.py +++ b/src/fairscape_cli/rocrate/rocrate.py @@ -2,36 +2,42 @@ import pathlib import shutil import json -from pydantic import ValidationError from datetime import datetime +from typing import List, Optional, Union -from fairscape_cli.config import ( - NAAN -) +from pydantic import ValidationError + +from fairscape_cli.config import NAAN +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid from fairscape_cli.models.utils import ( - FileNotInCrateException, - GenerateDatetimeSquid + FileNotInCrateException, + getDirectoryContents, + 
getEntityFromCrate, + run_command ) from fairscape_cli.models import ( + # Core models Dataset, - GenerateDataset, Software, - GenerateSoftware, Computation, + ROCrate, + BagIt, + + # Generator functions + GenerateDataset, + GenerateSoftware, GenerateComputation, GenerateROCrate, - ROCrate, + + # RO Crate operations ReadROCrateMetadata, AppendCrate, CopyToROCrate, - BagIt, - generateSummaryStatsElements -) - -from typing import ( - List, - Optional, - Union + UpdateCrate, + + # Additional utilities + generateSummaryStatsElements, + registerOutputs ) @@ -570,3 +576,85 @@ def dataset( except Exception as exc: click.echo(f"ERROR: {str(exc)}") ctx.exit(code=1) + +################# +# Summary Statistics +################# +@rocrate.command('compute-statistics') +@click.argument('rocrate-path', type=click.Path(exists=True, path_type=pathlib.Path)) +@click.option('--dataset-id', required=True, help='ID of dataset to compute statistics for') +@click.option('--software-id', required=True, help='ID of software to run') +@click.option('--command', required=True, help='Python command to execute (e.g. python)') +@click.pass_context +def compute_statistics( + ctx, + rocrate_path: pathlib.Path, + dataset_id: str, + software_id: str, + command: str +): + """Compute statistics for a dataset using specified software""" + crate_instance = ReadROCrateMetadata(rocrate_path) + initial_files = getDirectoryContents(rocrate_path) + + # Get original dataset info + dataset_info = getEntityFromCrate(crate_instance, dataset_id) + software_info = getEntityFromCrate(crate_instance, software_id) + if not dataset_info or not software_info: + raise ValueError(f"Dataset or software not found in crate") + + # Get original dataset author + original_author = dataset_info.get("author", "Unknown") + dataset_path = dataset_info.get("contentUrl", "").replace("file:///", "") + software_path = software_info.get("contentUrl", "").replace("file:///", "") + + if not dataset_path or not software_path: + raise ValueError("Dataset or software path not found") + + full_command = f"{command} {software_path} {dataset_path} {rocrate_path}" + success, stdout, stderr = run_command(full_command) + if not success: + raise RuntimeError(f"Command failed: {stderr}") + + final_files = getDirectoryContents(rocrate_path) + new_files = final_files - initial_files + if not new_files: + raise RuntimeError("No output files generated") + + computation_instance = GenerateComputation( + guid=None, + name=f"Statistics Computation for {dataset_id}", + runBy="Fairscape-CLI", + command=full_command, + dateCreated=datetime.now().isoformat(), + description=f"Generated statistics\nstdout:\n{stdout}\nstderr:\n{stderr}", + keywords=["statistics"], + usedSoftware=[software_id], + usedDataset=[dataset_id], + generated=[] + ) + + output_instances = registerOutputs( + new_files=new_files, + computation_id=computation_instance.guid, + dataset_id=dataset_id, + author=original_author + ) + + stats_output = [out.guid for out in output_instances] + computation_instance.generated = stats_output + + if stats_output: + # Update the original dataset metadata + dataset_info["hasSummaryStatistics"] = stats_output + # Generate a new Dataset instance with updated metadata + updated_dataset = Dataset.model_validate(dataset_info) + + # Update the dataset in the crate and append new elements + UpdateCrate(cratePath=rocrate_path, element=updated_dataset) + AppendCrate( + cratePath=rocrate_path, + elements=[computation_instance] + output_instances + ) + + 
click.echo(computation_instance.guid) \ No newline at end of file From ace90e1995966d9c800c4f02a2eb0d36d641c1a2 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 4 Dec 2024 13:30:54 -0500 Subject: [PATCH 04/14] tests --- tests/stats-compute-tests/numbers.csv | 11 ++ tests/stats-compute-tests/summary.py | 58 ++++++ tests/test_compute_stats.py | 132 +++++++++++++ tests/test_rocrate_api.py | 273 +++++++++++++------------- 4 files changed, 338 insertions(+), 136 deletions(-) create mode 100644 tests/stats-compute-tests/numbers.csv create mode 100644 tests/stats-compute-tests/summary.py create mode 100644 tests/test_compute_stats.py diff --git a/tests/stats-compute-tests/numbers.csv b/tests/stats-compute-tests/numbers.csv new file mode 100644 index 0000000..aa9321c --- /dev/null +++ b/tests/stats-compute-tests/numbers.csv @@ -0,0 +1,11 @@ +column1,column2,column3 +1,0.557412965,0.015765057 +2,0.595715476,4.632460772 +3,1.000511292,0.516892255 +4,3.634542545,16.3678812 +5,0.216278402,0.37567848 +6,3.346647036,3.666700797 +7,2.864322316,2.292766985 +8,0.508136324,0.434491093 +9,5.934758558,1.647603341 +10,1.092459463,1.04885126 \ No newline at end of file diff --git a/tests/stats-compute-tests/summary.py b/tests/stats-compute-tests/summary.py new file mode 100644 index 0000000..633bb35 --- /dev/null +++ b/tests/stats-compute-tests/summary.py @@ -0,0 +1,58 @@ +import pandas as pd +import sys +import os +from pathlib import Path + +def generate_summary_stats(input_path, output_dir): + """ + Generate summary statistics for a CSV file and save to output directory + + Parameters: + input_path (str): Path to input CSV file + output_dir (str): Directory to save output summary statistics + """ + # Read the input file + df = pd.read_csv(input_path) + + # Create summary statistics + summary_stats = pd.DataFrame({ + 'column_name': df.columns, + 'data_type': df.dtypes.astype(str), + 'count': df.count(), + 'null_count': df.isnull().sum(), + 'null_percentage': (df.isnull().sum() / len(df) * 100).round(2), + 'unique_values': df.nunique(), + }) + + # Add numeric column statistics + numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'mean'] = df[numeric_cols].mean() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'std'] = df[numeric_cols].std() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'min'] = df[numeric_cols].min() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'max'] = df[numeric_cols].max() + + # Create output directory if it doesn't exist + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Generate output filename from input filename + input_filename = os.path.basename(input_path) + output_filename = f"summary_stats_{input_filename}" + output_path = os.path.join(output_dir, output_filename) + + # Save summary statistics + summary_stats.to_csv(output_path, index=False) + print(f"Summary statistics saved to: {output_path}") + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python summary.py ") + sys.exit(1) + + input_path = sys.argv[1] + output_dir = sys.argv[2] + + try: + generate_summary_stats(input_path, output_dir) + except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) \ No newline at end of file diff --git a/tests/test_compute_stats.py b/tests/test_compute_stats.py new file mode 100644 index 0000000..3375502 --- /dev/null +++ b/tests/test_compute_stats.py @@ -0,0 +1,132 @@ +import os +import sys +import 
pathlib +import json +import shutil +import unittest +import subprocess +import datetime +from typing import Tuple + +class TestStatisticsCliWorkflow(unittest.TestCase): + + def setUp(self): + # Create test directory + self.test_dir = pathlib.Path.cwd() / 'tests' / 'stats-compute-tests' + self.test_dir.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + # Only remove the generated files, not the entire directory + metadata_file = self.test_dir / 'ro-crate-metadata.json' + stats_file = self.test_dir / 'summary_stats_numbers.csv' + + if metadata_file.exists(): + metadata_file.unlink() + if stats_file.exists(): + stats_file.unlink() + + def run_cli_command(self, command: str) -> Tuple[int, str, str]: + """Run a CLI command and return returncode, stdout, stderr""" + process = subprocess.Popen( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + stdout, stderr = process.communicate() + return process.returncode, stdout.strip(), stderr.strip() + + def test_cli_workflow(self): + # Change to test directory + os.chdir(self.test_dir) + + # Initialize ROCrate + init_cmd = '''python -m fairscape_cli rocrate init \ + --name "Data Analysis Project" \ + --organization-name "My Organization" \ + --project-name "Data Analysis" \ + --description "A project for analyzing data using summary statistics" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python"''' + + returncode, stdout, stderr = self.run_cli_command(init_cmd) + self.assertEqual(returncode, 0, f"ROCrate init failed: {stderr}") + rocrate_guid = stdout.strip() + + # Register software + software_cmd = f'''python -m fairscape_cli rocrate register software ./ \ + --name "Summary Statistics Generator" \ + --author "Your Name" \ + --version "1.0.0" \ + --description "Python script that generates summary statistics for CSV data" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python" \ + --file-format "text/x-python" \ + --date-modified "{datetime.date.today().isoformat()}" \ + --filepath "summary.py"''' + + returncode, stdout, stderr = self.run_cli_command(software_cmd) + self.assertEqual(returncode, 0, f"Software registration failed: {stderr}") + software_guid = stdout.strip() + + # Register dataset + dataset_cmd = f'''python -m fairscape_cli rocrate register dataset ./ \ + --name "Analysis Dataset" \ + --author "Your Name" \ + --version "1.0.0" \ + --date-published "{datetime.date.today().isoformat()}" \ + --description "Dataset for statistical analysis" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python" \ + --data-format "text/csv" \ + --filepath "numbers.csv"''' + + returncode, stdout, stderr = self.run_cli_command(dataset_cmd) + self.assertEqual(returncode, 0, f"Dataset registration failed: {stderr}") + dataset_guid = stdout.strip() + + # Compute statistics + compute_cmd = f'''python -m fairscape_cli rocrate compute-statistics ./ \ + --dataset-id "{dataset_guid}" \ + --software-id "{software_guid}" \ + --command "python"''' + + returncode, stdout, stderr = self.run_cli_command(compute_cmd) + self.assertEqual(returncode, 0, f"Computation failed: {stderr}") + computation_guid = stdout.strip() + + # Verify the metadata file exists and has correct structure + metadata_file = self.test_dir / 'ro-crate-metadata.json' + self.assertTrue(metadata_file.exists()) + + # Load and verify metadata + with open(metadata_file) as f: + metadata = json.load(f) + + # Basic structure tests + self.assertEqual(metadata['name'], "Data Analysis Project") + 
self.assertEqual(metadata['@id'], rocrate_guid) + + # Verify all components are present in @graph + guids = [item['@id'] for item in metadata['@graph']] + self.assertIn(software_guid, guids) + self.assertIn(dataset_guid, guids) + self.assertIn(computation_guid, guids) + + # Find computation record + computation = next(item for item in metadata['@graph'] if item['@id'] == computation_guid) + + # Verify computation relationships + self.assertEqual(computation['usedSoftware'], [software_guid]) + self.assertEqual(computation['usedDataset'], [dataset_guid]) + self.assertTrue(len(computation['generated']) > 0) + + # Verify output file exists + output_file = self.test_dir / 'summary_stats_numbers.csv' + self.assertTrue(output_file.exists()) + + # Find dataset record and verify it has summary statistics + dataset = next(item for item in metadata['@graph'] if item['@id'] == dataset_guid) + self.assertTrue('hasSummaryStatistics' in dataset) + self.assertEqual(dataset['hasSummaryStatistics'], computation['generated']) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_rocrate_api.py b/tests/test_rocrate_api.py index d19fbae..2aec4d5 100644 --- a/tests/test_rocrate_api.py +++ b/tests/test_rocrate_api.py @@ -2,6 +2,7 @@ import sys import pathlib import json +import shutil sys.path.insert( 0, @@ -19,145 +20,145 @@ from fairscape_cli.models.dataset import GenerateDataset from fairscape_cli.models.software import GenerateSoftware from fairscape_cli.models.rocrate import ( - GenerateROCrate, - ReadROCrateMetadata, - AppendCrate + GenerateROCrate, + ReadROCrateMetadata, + AppendCrate ) from sqids import Sqids class TestAPI(unittest.TestCase): - - def test_api(self): - rocratePath = pathlib.Path.cwd() / 'tests'/ 'data' / 'test_api' - - # delete the test_api folder - metadataFile = rocratePath / 'ro-crate-metadata.json' - metadataFile.unlink() - - rocrate_metadata = { - "guid": "ark:59853/UVA/B2AI/rocrate_test", - "name": 'test rocrate', - "organizationName": "UVA", - "projectName": "B2AI", - "description": "Testing ROCrate Model", - "keywords": ["test", "fair"], - "path": rocratePath - } - - # touch a file for the dataset to say exists - - rocrate = GenerateROCrate(**rocrate_metadata) - - software_metadata={ - "guid" : "955cf26c-e3a3-4f0f-b2df-fca4c693cac4:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", - "author": "Cell Maps team", - "url": "https://github.com/idekerlab/cellmaps_utils", - "name": "cellmaps_utils", - "keywords": [ - "CM4AI", - "0.7alpha", - "MDA-MB-468", - "untreated", - "IF microscopy", - "images", - "breast; mammary gland", - "chromatin", - "tools", - "cellmaps_utils" - ], - "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin Contains utilities needed by Cell Maps tools", - "dateModified": "2024-10-22", - "version": "0.5.0", - "fileFormat": "py", - "usedByComputation": [], - "associatedPublication": None, - "additionalDocumentation": None, - "filepath": "https://github.com/idekerlab/cellmaps_utils", - "cratePath": rocratePath - } - software = GenerateSoftware(**software_metadata) - - yellowFolder = rocratePath / 'yellow' - yellowFolder.mkdir(exist_ok=True) - - # create 10k identifiers - datasetList = [] - #for i in range(100000): - # fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' - # datasetFilePath = yellowFolder / fileName - # datasetFilePath.touch(exist_ok=True) - - for i in range(10000): - fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' - datasetMetadata = { - 
"guid": "322ab5a2-e6a7-4c46-be79-cbf3e9453cde:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", - "name": "B2AI_5_untreated_B5_R5_z01_yellow.jpg yellow channel image", - "keywords": [ - "CM4AI", - "0.7alpha", - "MDA-MB-468", - "untreated", - "IF microscopy", - "images", - "breast; mammary gland", - "chromatin", - "yellow", - "IF", - "image", - "ER (Calreticulin antibody)" - ], - "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin IF image file", - "author": "Lundberg Lab", - "datePublished": "2024-10-22", - "version": "0.7alpha", - "dataFormat": "jpg", - "generatedBy": [], - "derivedFrom": [], - "usedBy": [], - "url": None, - "associatedPublication": None, - "additionalDocumentation": None, - "schema": None, - "filepath": f"file:///yellow/{fileName}", - "cratePath": rocratePath - } - dataset = GenerateDataset(**datasetMetadata) - datasetList.append(dataset) - - AppendCrate(rocratePath, datasetList) - - # read in the crate metadata - rocrateMetadataRecord = ReadROCrateMetadata(rocratePath) - rocrateGUIDs = [ elem.guid for elem in rocrateMetadataRecord.metadataGraph] - - # assert that all dataset guids are present - for ds in datasetList: - assert ds.guid in rocrateGUIDs - - computation_metadata = { - "guid": "test guid", - "name": "Image Compression", - "runBy": "Chris Churas", - "command": "./test.sh", - "dateCreated": "10-28-2024", - "description": "A placeholder computation for image compression", - "keywords": ["cm4ai", "image"], - "usedSoftware": software.guid, - "usedDataset": [ds.guid for ds in datasetList], - "generated": None - } - computation = GenerateComputation(**computation_metadata) - AppendCrate(rocratePath, [software, computation]) - - # read in ROCrate - rocrateMetadataRecord = ReadROCrateMetadata(rocratePath) - rocrateGUIDs = [ elem.guid for elem in rocrateMetadataRecord.metadataGraph] - - assert computation.guid in rocrateGUIDs - assert software.guid in rocrateGUIDs - - - + + def setUp(self): + # Create test directory structure + self.rocratePath = pathlib.Path.cwd() / 'tests' / 'data' / 'test_api' + self.rocratePath.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + # Clean up test directory after tests + pass + # if self.rocratePath.exists(): + # shutil.rmtree(self.rocratePath) + + def test_api(self): + # Clean start - safely handle metadata file deletion + metadataFile = self.rocratePath / 'ro-crate-metadata.json' + if metadataFile.exists(): + metadataFile.unlink() + + rocrate_metadata = { + "guid": "ark:59853/UVA/B2AI/rocrate_test", + "name": 'test rocrate', + "organizationName": "UVA", + "projectName": "B2AI", + "description": "Testing ROCrate Model", + "keywords": ["test", "fair"], + "path": self.rocratePath + } + + rocrate = GenerateROCrate(**rocrate_metadata) + + software_metadata = { + "guid": "955cf26c-e3a3-4f0f-b2df-fca4c693cac4:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", + "author": "Cell Maps team", + "url": "https://github.com/idekerlab/cellmaps_utils", + "name": "cellmaps_utils", + "keywords": [ + "CM4AI", + "0.7alpha", + "MDA-MB-468", + "untreated", + "IF microscopy", + "images", + "breast; mammary gland", + "chromatin", + "tools", + "cellmaps_utils" + ], + "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin Contains utilities needed by Cell Maps tools", + "dateModified": "2024-10-22", + "version": "0.5.0", + "fileFormat": "py", + "usedByComputation": [], + "associatedPublication": None, + "additionalDocumentation": None, + 
"filepath": "https://github.com/idekerlab/cellmaps_utils", + "cratePath": self.rocratePath + } + software = GenerateSoftware(**software_metadata) + + yellowFolder = self.rocratePath / 'yellow' + yellowFolder.mkdir(exist_ok=True) + + # Create datasets + datasetList = [] + for i in range(10000): + fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' + datasetMetadata = { + "guid": f"322ab5a2-e6a7-4c46-be79-cbf3e9453cde:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha_{i}", # Make unique + "name": f"B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg yellow channel image", + "keywords": [ + "CM4AI", + "0.7alpha", + "MDA-MB-468", + "untreated", + "IF microscopy", + "images", + "breast; mammary gland", + "chromatin", + "yellow", + "IF", + "image", + "ER (Calreticulin antibody)" + ], + "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin IF image file", + "author": "Lundberg Lab", + "datePublished": "2024-10-22", + "version": "0.7alpha", + "dataFormat": "jpg", + "generatedBy": [], + "derivedFrom": [], + "usedBy": [], + "url": None, + "associatedPublication": None, + "additionalDocumentation": None, + "schema": None, + "filepath": f"file:///yellow/{fileName}", + "cratePath": self.rocratePath + } + dataset = GenerateDataset(**datasetMetadata) + datasetList.append(dataset) + + AppendCrate(self.rocratePath, datasetList) + + # Verify crate metadata + rocrateMetadataRecord = ReadROCrateMetadata(self.rocratePath) + rocrateGUIDs = [elem.guid for elem in rocrateMetadataRecord.metadataGraph] + + # Verify all dataset GUIDs are present + for ds in datasetList: + self.assertIn(ds.guid, rocrateGUIDs, f"Dataset GUID {ds.guid} not found in metadata") + + computation_metadata = { + "guid": "test-computation-guid", # Made more specific + "name": "Image Compression", + "runBy": "Chris Churas", + "command": "./test.sh", + "dateCreated": "10-28-2024", + "description": "A placeholder computation for image compression", + "keywords": ["cm4ai", "image"], + "usedSoftware": software.guid, + "usedDataset": [ds.guid for ds in datasetList], + "generated": None + } + computation = GenerateComputation(**computation_metadata) + AppendCrate(self.rocratePath, [software, computation]) + + # Final verification + rocrateMetadataRecord = ReadROCrateMetadata(self.rocratePath) + rocrateGUIDs = [elem.guid for elem in rocrateMetadataRecord.metadataGraph] + + self.assertIn(computation.guid, rocrateGUIDs, "Computation GUID not found in metadata") + self.assertIn(software.guid, rocrateGUIDs, "Software GUID not found in metadata") if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() \ No newline at end of file From b1da76e83914017b0feb56e9260e05a71a3625e7 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 4 Dec 2024 16:16:35 -0500 Subject: [PATCH 05/14] fix 2 vs 3 error --- src/fairscape_cli/models/computation.py | 2 +- src/fairscape_cli/models/dataset.py | 1 - tests/test_compute_stats.py | 72 +++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/fairscape_cli/models/computation.py b/src/fairscape_cli/models/computation.py index a4d8a3d..24e67ac 100644 --- a/src/fairscape_cli/models/computation.py +++ b/src/fairscape_cli/models/computation.py @@ -60,7 +60,7 @@ def GenerateComputation( computation_model = Computation.model_validate( { "@id": guid, - "@type": "https://w2id.org/EVI#Computation", + "@type": "https://w3id.org/EVI#Computation", "name": name, "description": description, "keywords": keywords, diff --git 
a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index bf46cb2..9f662ff 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -82,7 +82,6 @@ def GenerateDataset( "derivedFrom": [derived.strip("\n") for derived in derivedFrom], "usedBy": [used.strip("\n") for used in usedBy], "generatedBy": [gen.strip("\n") for gen in generatedBy], - "contentU" "hasSummaryStatistics": summary_stats_guid } diff --git a/tests/test_compute_stats.py b/tests/test_compute_stats.py index 3375502..e5c5711 100644 --- a/tests/test_compute_stats.py +++ b/tests/test_compute_stats.py @@ -19,11 +19,14 @@ def tearDown(self): # Only remove the generated files, not the entire directory metadata_file = self.test_dir / 'ro-crate-metadata.json' stats_file = self.test_dir / 'summary_stats_numbers.csv' + summary_file = self.test_dir / 'fake_summary.csv' if metadata_file.exists(): metadata_file.unlink() if stats_file.exists(): stats_file.unlink() + if summary_file.exists(): + summary_file.unlink() def run_cli_command(self, command: str) -> Tuple[int, str, str]: """Run a CLI command and return returncode, stdout, stderr""" @@ -128,5 +131,74 @@ def test_cli_workflow(self): self.assertTrue('hasSummaryStatistics' in dataset) self.assertEqual(dataset['hasSummaryStatistics'], computation['generated']) + def test_dataset_with_summary_stats(self): + # Change to test directory + os.chdir(self.test_dir) + + # Initialize ROCrate + init_cmd = '''python -m fairscape_cli rocrate init \ + --name "Dataset Summary Test" \ + --organization-name "Test Organization" \ + --project-name "Summary Stats Test" \ + --description "Testing dataset registration with summary statistics" \ + --keywords "data" --keywords "testing" --keywords "summary-stats"''' + + returncode, stdout, stderr = self.run_cli_command(init_cmd) + self.assertEqual(returncode, 0, f"ROCrate init failed: {stderr}") + rocrate_guid = stdout.strip() + + # Create fake summary file + summary_path = self.test_dir / 'fake_summary.csv' + with open(summary_path, 'w') as f: + f.write("statistic,value\nmean,42.0\nmedian,41.5\nstd,5.2") + + # Register dataset with summary statistics + dataset_cmd = f'''python -m fairscape_cli rocrate register dataset ./ \ + --name "Test Dataset" \ + --author "Test Author" \ + --version "1.0.0" \ + --date-published "{datetime.date.today().isoformat()}" \ + --description "Dataset with pre-existing summary statistics" \ + --keywords "data" --keywords "testing" \ + --data-format "text/csv" \ + --filepath "numbers.csv" \ + --summary-statistics-filepath "fake_summary.csv"''' + + returncode, stdout, stderr = self.run_cli_command(dataset_cmd) + self.assertEqual(returncode, 0, f"Dataset registration failed: {stderr}") + dataset_guid = stdout.strip() + + # Verify the metadata file exists and has correct structure + metadata_file = self.test_dir / 'ro-crate-metadata.json' + self.assertTrue(metadata_file.exists()) + + # Load and verify metadata + with open(metadata_file) as f: + metadata = json.load(f) + + # Find dataset record and verify it has summary statistics + dataset = next(item for item in metadata['@graph'] if item['@id'] == dataset_guid) + + # Get summary stats ID + summary_stats_id = dataset['hasSummaryStatistics'] + + # Find the summary statistics dataset in the graph - with more flexible matching + summary_stats = next( + (item for item in metadata['@graph'] + if 'stats' in item['@id'] and item['@type'] == 'https://w3id.org/EVI#Dataset'), + None + ) + self.assertEqual(summary_stats['@type'], 
'https://w3id.org/EVI#Dataset') + self.assertTrue('stats' in summary_stats['@id']) + self.assertEqual(summary_stats['author'], 'Test Author') + + computation = next( + (item for item in metadata['@graph'] + if item['@type'] == 'https://w3id.org/EVI#Computation' and summary_stats_id in item.get('generated', [])), + None + ) + self.assertIsNotNone(computation) + self.assertEqual(computation['usedDataset'], [dataset_guid]) + if __name__ == '__main__': unittest.main() \ No newline at end of file From 77cdd3ec56c247040ef5a17382a47f7280e7afda Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 12:28:11 -0500 Subject: [PATCH 06/14] frictionless draft --- .../models/schema/frictionless_tabular.py | 556 ++++++++++++++++++ .../schema/frictionless_schema.py | 345 +++++++++++ 2 files changed, 901 insertions(+) create mode 100644 src/fairscape_cli/models/schema/frictionless_tabular.py create mode 100644 src/fairscape_cli/schema/frictionless_schema.py diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py new file mode 100644 index 0000000..6ead41b --- /dev/null +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -0,0 +1,556 @@ +import pathlib +import os +import json +import pandas as pd +import h5py +from datetime import datetime +from enum import Enum +from pydantic import ( + BaseModel, + ConfigDict, + Field, + ValidationError, + model_validator +) +from typing import ( + Dict, + List, + Optional, + Literal, + Union +) +from frictionless import Schema, Resource, describe, fields + + +from fairscape_cli.models.schema.utils import ( + PropertyNameException, + ColumnIndexException, +) + +from fairscape_cli.config import ( + DEFAULT_CONTEXT, + DEFAULT_SCHEMA_TYPE, + NAAN, +) + +class FileType(str, Enum): + CSV = "csv" + TSV = "tsv" + PARQUET = "parquet" + + @classmethod + def from_extension(cls, filepath: str) -> 'FileType': + ext = pathlib.Path(filepath).suffix.lower()[1:] + if ext == 'parquet': + return cls.PARQUET + elif ext == 'tsv': + return cls.TSV + elif ext == 'csv': + return cls.CSV + else: + raise ValueError(f"Unsupported file extension: {ext}") + +class ValidationError(BaseModel): + message: str + row: Optional[int] = None + field: Optional[str] = None + type: str = "ValidationError" + failed_keyword: str + path: Optional[str] = None + +class DatatypeEnum(str, Enum): + NULL = "null" + BOOLEAN = "boolean" + STRING = "string" + NUMBER = "number" + INTEGER = "integer" + ARRAY = "array" + +class Items(BaseModel): + model_config = ConfigDict( + populate_by_name = True, + use_enum_values=True + ) + datatype: DatatypeEnum = Field(alias="type") + +class BaseProperty(BaseModel): + description: str = Field(description="description of field") + model_config = ConfigDict(populate_by_name = True) + index: Union[int,str] = Field(description="index of the column for this value") + valueURL: Optional[str] = Field(default=None) + +class NullProperty(BaseProperty): + datatype: Literal['null'] = Field(alias="type", default='null') + index: int + +class StringProperty(BaseProperty): + datatype: Literal['string'] = Field(alias="type") + pattern: Optional[str] = Field(description="Regex pattern to execute against values", default=None) + maxLength: Optional[int] = Field(description="Inclusive maximum length for string values", default=None) + minLength: Optional[int] = Field(description="Inclusive minimum length for string values", default=None) + index: int + +class ArrayProperty(BaseProperty): + datatype: 
Literal['array'] = Field(alias="type") + maxItems: Optional[int] = Field(description="max items in array, validation fails if length is greater than this value", default=None) + minItems: Optional[int] = Field(description="min items in array, validation fails if lenght is shorter than this value", default=None) + uniqueItems: Optional[bool] = Field() + index: str + items: Items + +class BooleanProperty(BaseProperty): + datatype: Literal['boolean'] = Field(alias="type") + index: int + +class NumberProperty(BaseProperty): + datatype: Literal['number'] = Field(alias="type") + maximum: Optional[float] = Field(description="Inclusive Upper Limit for Values", default=None) + minimum: Optional[float] = Field(description="Inclusive Lower Limit for Values", default=None) + index: int + + @model_validator(mode='after') + def check_max_min(self) -> 'NumberProperty': + minimum = self.minimum + maximum = self.maximum + + if maximum is not None and minimum is not None: + if maximum == minimum: + raise ValueError('NumberProperty attribute minimum != maximum') + elif maximum < minimum: + raise ValueError('NumberProperty attribute maximum !< minimum') + return self + +class IntegerProperty(BaseProperty): + datatype: Literal['integer'] = Field(alias="type") + maximum: Optional[int] = Field(description="Inclusive Upper Limit for Values", default=None) + minimum: Optional[int] = Field(description="Inclusive Lower Limit for Values", default=None) + index: int + + @model_validator(mode='after') + def check_max_min(self) -> 'IntegerProperty': + minimum = self.minimum + maximum = self.maximum + + if maximum is not None and minimum is not None: + if maximum == minimum: + raise ValueError('IntegerProperty attribute minimum != maximum') + elif maximum < minimum: + raise ValueError('IntegerProperty attribute maximum !< minimum') + return self + +def frictionless_type_to_json_schema(field_type: str) -> str: + """Convert Frictionless types to JSON Schema types""" + type_mapping = { + 'string': 'string', + 'integer': 'integer', + 'number': 'number', + 'boolean': 'boolean', + 'date': 'string', + 'datetime': 'string', + 'year': 'integer', + 'yearmonth': 'string', + 'duration': 'string', + 'geopoint': 'array', + 'geojson': 'object', + 'array': 'array', + 'object': 'object', + 'time': 'string' + } + return type_mapping.get(field_type, 'string') + +class TabularValidationSchema(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + guid: Optional[str] = Field(alias="@id", default=None) + context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") + metadataType: Optional[str] = Field(default=DEFAULT_SCHEMA_TYPE, alias="@type") + schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="$schema") + name: str + description: str + datatype: str = Field(default="object", alias="type") + separator: str = Field(description="Field separator for the file") + header: bool = Field(description="Do files of this schema have a header row", default=True) + required: List[str] = Field(default=[]) + properties: Dict[str, Dict] = Field(default={}) + additionalProperties: bool = Field(default=True) + + # Will store the frictionless schema + _frictionless_schema: Optional[Schema] = None + + def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" + if self.guid is None: + prefix = f"schema-{self.name.lower().replace(' ', '-')}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" + return self.guid + + 
@model_validator(mode='after') + def generate_all_guids(self) -> 'TabularValidationSchema': + """Generate GUIDs for this schema and any nested schemas""" + self.generate_guid() + return self + + @classmethod + def infer_from_file(cls, filepath: str, name: str, description: str) -> 'TabularValidationSchema': + """Infer schema from a file using Frictionless""" + file_type = FileType.from_extension(filepath) + separator = '\t' if file_type == FileType.TSV else ',' + + resource = describe(filepath) + + properties = {} + required_fields = [] + + for i, field in enumerate(resource.schema.fields): + json_schema_type = frictionless_type_to_json_schema(field.type) + + property_def = { + "type": json_schema_type, + "description": field.description or f"Column {field.name}", + "index": i + } + + properties[field.name] = property_def + + if field.required: + required_fields.append(field.name) + + schema = cls( + name=name, + description=description, + separator=separator, + header=True, + properties=properties, + required=required_fields + ) + schema._frictionless_schema = resource.schema + return schema + + def validate_file(self, filepath: str) -> List[ValidationError]: + """Validate a file against the schema using Frictionless""" + if not self._frictionless_schema: + raise ValueError("Schema not properly initialized") + + resource = Resource( + path=os.path.basename(filepath), + basepath=os.path.dirname(filepath), + schema=self._frictionless_schema + ) + report = resource.validate() + + # Convert Frictionless errors to our format + errors = [] + for task in report.tasks: + for error in task.errors: + if isinstance(error, TypeError): + validation_error = ValidationError( + message=str(error), + type="ValidationError", + failed_keyword="type" + ) + else: + validation_error = ValidationError( + message=error.message, + row=error.row_number if hasattr(error, 'row_number') else None, + field=error.field_name if hasattr(error, 'field_name') else None, + failed_keyword=error.code if hasattr(error, 'code') else "error" + ) + errors.append(validation_error) + + return errors + + def to_dict(self) -> dict: + """Convert the schema to a dictionary format""" + return self.model_dump(by_alias=True, exclude={'_frictionless_schema'}) + + @classmethod + def from_dict(cls, data: dict) -> 'TabularValidationSchema': + """Create a schema instance from a dictionary""" + properties = data.pop('properties', {}) + required_fields = data.pop('required', []) + frictionless_schema = Schema() + + # Map JSON Schema types to frictionless field types + type_to_field = { + 'string': fields.StringField, + 'integer': fields.IntegerField, + 'number': fields.NumberField, + 'boolean': fields.BooleanField, + 'array': fields.ArrayField + } + + for name, prop in properties.items(): + field_type = type_to_field.get(prop.get('type', 'string'), fields.StringField) + field = field_type( + name=name, + description=prop.get('description', ''), + constraints={} + ) + + if 'minimum' in prop: + field.constraints['minimum'] = prop['minimum'] + if 'maximum' in prop: + field.constraints['maximum'] = prop['maximum'] + if 'pattern' in prop: + field.constraints['pattern'] = prop['pattern'] + if 'minLength' in prop: + field.constraints['minLength'] = prop['minLength'] + if 'maxLength' in prop: + field.constraints['maxLength'] = prop['maxLength'] + + frictionless_schema.add_field(field) + + # Create our schema instance + schema = cls(**data, properties=properties, required=required_fields) + schema._frictionless_schema = frictionless_schema + return 
schema + +def read_schema(schema_file: str) -> TabularValidationSchema: + """Read a schema from a file""" + schema_path = pathlib.Path(schema_file) + + if not schema_path.exists(): + raise FileNotFoundError(f"Schema file not found: {schema_file}") + + with schema_path.open('r') as f: + schema_dict = json.load(f) + + return TabularValidationSchema.from_dict(schema_dict) + +def write_schema(schema: TabularValidationSchema, output_file: str): + """Write a schema to a file""" + schema_dict = schema.to_dict() + + with open(output_file, 'w') as f: + json.dump(schema_dict, f, indent=2) + +class HDF5ValidationSchema(BaseModel): + name: str + description: str + properties: Dict[str, TabularValidationSchema] = Field(default={}) + required: List[str] = Field(default=[]) + + @staticmethod + def dataset_to_dataframe(dataset: h5py.Dataset) -> pd.DataFrame: + """Convert an HDF5 dataset to a pandas DataFrame""" + data = dataset[()] + + if dataset.dtype.fields: # Structured array + return pd.DataFrame(data) + elif len(dataset.shape) > 1: # Multi-dimensional array + n_cols = dataset.shape[1] + columns = [f"column_{i}" for i in range(n_cols)] + return pd.DataFrame(data, columns=columns) + else: # 1D array + return pd.DataFrame(data, columns=['value']) + + @classmethod + def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5ValidationSchema': + """Infer schema from an HDF5 file""" + schema = cls( + name=name, + description=description + ) + properties = {} + + with h5py.File(filepath, 'r') as f: + def process_group(group, parent_path=""): + for key, item in group.items(): + path = f"{parent_path}/{key}" if parent_path else key + + if isinstance(item, h5py.Dataset): + try: + df = cls.dataset_to_dataframe(item) + + resource = describe(df) + + tabular_schema = TabularValidationSchema( + name=f"{name}_{path.replace('/', '_')}", + description=f"Dataset at {path}", + separator=",", + header=True, + properties={}, + required=[] + ) + + tabular_schema._frictionless_schema = resource.schema + + for i, field in enumerate(resource.schema.fields): + property_def = { + "type": field.type, + "description": field.description or f"Column {field.name}", + "index": i + } + + if field.constraints: + for key, value in field.constraints.items(): + property_def[key] = value + + tabular_schema.properties[field.name] = property_def + tabular_schema.required.append(field.name) + + properties[path] = tabular_schema + + except Exception as e: + print(f"Warning: Could not process dataset {path}: {str(e)}") + + elif isinstance(item, h5py.Group): + process_group(item, path) + + process_group(f) + schema.properties = properties + schema.required = list(properties.keys()) + + return schema + + def validate_file(self, filepath: str) -> List[ValidationError]: + """Validate an HDF5 file against the schema""" + errors = [] + + with h5py.File(filepath, 'r') as f: + for path, schema in self.properties.items(): + try: + dataset = f[path] + if isinstance(dataset, h5py.Dataset): + df = self.dataset_to_dataframe(dataset) + resource = Resource(data=df, schema=schema._frictionless_schema) + report = resource.validate() + + for task in report.tasks: + for error in task.errors: + # Skip string type errors + if (hasattr(error, 'type') and error.type == 'type-error' and + hasattr(error, 'note') and 'type is "string' in error.note): + continue + + validation_error = ValidationError( + message=error.message, + row=error.rowNumber if hasattr(error, 'rowNumber') else None, + field=error.fieldName if hasattr(error, 'fieldName') else None, 
+ type="ValidationError", + failed_keyword=error.type if hasattr(error, 'type') else "error", + path=path + ) + errors.append(validation_error) + + except KeyError: + errors.append(ValidationError( + message=f"Dataset {path} not found", + type="ValidationError", + failed_keyword="required", + path=path + )) + except Exception as e: + errors.append(ValidationError( + message=f"Error validating dataset {path}: {str(e)}", + type="ValidationError", + failed_keyword="format", + path=path + )) + + return errors + + def to_dict(self) -> dict: + """Convert the schema to a dictionary format""" + return { + "name": self.name, + "description": self.description, + "properties": { + path: schema.to_dict() + for path, schema in self.properties.items() + }, + "required": self.required + } + + @classmethod + def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': + """Create a schema instance from a dictionary""" + properties = { + path: TabularValidationSchema.from_dict(schema_dict) + for path, schema_dict in data.get('properties', {}).items() + } + + return cls( + name=data['name'], + description=data['description'], + properties=properties, + required=data.get('required', []) + ) + +def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: + # check that schemaFile exists + schemaPath = pathlib.Path(schemaFilepath) + + if not schemaPath.exists(): + raise Exception + + with schemaPath.open("r+") as schemaFile: + schemaFileContents = schemaFile.read() + schemaJson = json.loads(schemaFileContents) + + # load the model into a tabular validation schema + schemaModel = TabularValidationSchema.model_validate(schemaJson) + + # TODO check for inconsitencies + + # does there exist a property with same name + if propertyName in [key for key in schemaModel.properties.keys()]: + raise PropertyNameException(propertyName) + + # does there exist a property with same column number + schema_indicies = [ val.index for val in schemaModel.properties.values()] + + # check overlap of indicies + # CheckOverlap + + + # add new property to schema + schemaModel.properties[propertyName] = propertyInstance + + # add new property as required + schemaModel.required.append(propertyName) + + # serialize model to json + schemaJson = json.dumps(schemaModel.model_dump(by_alias=True) , indent=2) + + # overwrite file contents + schemaFile.seek(0) + schemaFile.write(schemaJson) + +def ClickAppendProperty(ctx, schemaFile, propertyModel, name): + try: + # append the property to the + AppendProperty(schemaFile, propertyModel, name) + print(f"Added Property\tname: {name}\ttype: {propertyModel.datatype}") + ctx.exit(code=0) + + except ColumnIndexException as indexException: + print("ERROR: ColumnIndexError") + print(str(indexException)) + ctx.exit(code=1) + except PropertyNameException as propertyException: + print("ERROR: PropertyNameError") + print(str(propertyException)) + ctx.exit(code=1) + +def ReadSchemaGithub(schemaURI: str) -> TabularValidationSchema: + pass + +def ReadSchemaFairscape(schemaArk: str) -> TabularValidationSchema: + pass + +def ReadSchemaLocal(schemaFile: str) -> TabularValidationSchema: + """ Helper function for reading the schema and marshaling into the pydantic model + """ + schemaPath = pathlib.Path(schemaFile) + + # read the schema + with schemaPath.open("r") as inputSchema: + inputSchemaData = inputSchema.read() + schemaJson = json.loads(inputSchemaData) + + # load the model into + tabularSchema = TabularValidationSchema.model_validate(schemaJson) + return tabularSchema + diff --git 
a/src/fairscape_cli/schema/frictionless_schema.py b/src/fairscape_cli/schema/frictionless_schema.py new file mode 100644 index 0000000..1b315d4 --- /dev/null +++ b/src/fairscape_cli/schema/frictionless_schema.py @@ -0,0 +1,345 @@ +import click +import json +from prettytable import PrettyTable +import pathlib +from pydantic import ( + ValidationError +) +from typing import ( + Union, + Type +) + +from fairscape_cli.models.schema.frictionless_tabular import ( + TabularValidationSchema, + HDF5ValidationSchema, + write_schema as WriteSchema, + read_schema as ReadSchema, + StringProperty, + NumberProperty, + IntegerProperty, + BooleanProperty, + ArrayProperty, + AppendProperty, + ClickAppendProperty, + DatatypeEnum, + Items, + PropertyNameException, + ColumnIndexException +) + +from fairscape_cli.config import ( + FAIRSCAPE_URI +) + +@click.group('schema') +def schema(): + """Invoke operations on dataset schema. + """ + pass + +@schema.command('create-tabular') +@click.option('--name', required=True, type=str) +@click.option('--description', required=True, type=str) +@click.option('--guid', required=False, type=str, default="", show_default=False) +@click.option('--separator', type=str, required=True) +@click.option('--header', required=False, type=bool, default=False) +@click.argument('schema_file', type=str) +@click.pass_context +def create_tabular_schema( + ctx, + name, + description, + guid, + header, + separator, + schema_file +): + """Initialize a Tabular Schema. + """ + try: + schema_model = TabularValidationSchema.model_validate({ + "name": name, + "description":description, + "guid":guid, + "properties":{}, + "required": [], + "header":header, + "separator": separator + }) + + except ValidationError as metadataError: + click.echo("ERROR Validating TabularValidationSchema") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + WriteSchema(schema_model, schema_file) + click.echo(f"Wrote Schema: {str(schema_file)}") + +@schema.group('add-property') +def add_property(): + """Add a Property to an existing schema. + """ + pass + +@add_property.command('string') +@click.option('--name', type=str, required=True) +@click.option('--index', type=int, required=True) +@click.option('--description', type=str, required=True) +@click.option('--value-url', type=str, required=False) +@click.option('--pattern', type=str, required=False) +@click.argument('schema_file', type=click.Path(exists=True)) +@click.pass_context +def add_property_string(ctx, name, index, description, value_url, pattern, schema_file): + """Add a String Property to an existing Schema. 
+ """ + try: + stringPropertyModel = StringProperty.model_validate({ + "name": name, + "index": index, + "type": "string", + "description": description, + "valueURL": value_url, + "pattern": pattern + }) + except ValidationError as metadataError: + click.echo("ERROR Validating StringProperty") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + ClickAppendProperty(ctx, schema_file, stringPropertyModel, name) + +@add_property.command('number') +@click.option('--name', type=str, required=True) +@click.option('--index', type=int, required=True) +@click.option('--description', type=str, required=True) +@click.option('--maximum', type=float, required=False) +@click.option('--minimum', type=float, required=False) +@click.option('--value-url', type=str, required=False) +@click.argument('schema_file', type=click.Path(exists=True)) +@click.pass_context +def add_property_number(ctx, name, index, description, maximum, minimum, value_url, schema_file): + """Add a Numeric property to an existing Schema. + """ + try: + numberPropertyModel = NumberProperty.model_validate({ + "name": name, + "index": index, + "type": "number", + 'maximum': maximum, + 'minimum': minimum, + "description": description, + "valueURL": value_url + }) + except ValidationError as metadataError: + click.echo("ERROR Validating NumberProperty") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + ClickAppendProperty(ctx, schema_file, numberPropertyModel, name) + +@add_property.command('boolean') +@click.option('--name', type=str, required=True) +@click.option('--index', type=int, required=True) +@click.option('--description', type=str, required=True) +@click.option('--value-url', type=str, required=False) +@click.argument('schema_file', type=click.Path(exists=True)) +@click.pass_context +def add_property_boolean(ctx, name, index, description, value_url, schema_file): + """Add a Boolean property to an existing Schema. + """ + try: + booleanPropertyModel = BooleanProperty.model_validate({ + "name": name, + "index": index, + "type": "boolean", + "description": description, + "valueURL": value_url + }) + except ValidationError as metadataError: + click.echo("ERROR Validating BooleanProperty") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + ClickAppendProperty(ctx, schema_file, booleanPropertyModel, name) + +@add_property.command('integer') +@click.option('--name', type=str, required=True) +@click.option('--index', type=int, required=True) +@click.option('--description', type=str, required=True) +@click.option('--maximum', type=int, required=False) +@click.option('--minimum', type=int, required=False) +@click.option('--value-url', type=str, required=False) +@click.argument('schema_file', type=click.Path(exists=True)) +@click.pass_context +def add_property_integer(ctx, name, index, description, maximum, minimum, value_url, schema_file): + """Add an Integer property to an existing Schema. 
+ """ + try: + integerPropertyModel = IntegerProperty.model_validate({ + "name": name, + "index": index, + "type": "integer", + "description": description, + "maximum": maximum, + "minimum": minimum, + "valueURL": value_url + }) + except ValidationError as metadataError: + click.echo("ERROR Validating IntegerProperty") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + ClickAppendProperty(ctx, schema_file, integerPropertyModel, name) + +@add_property.command('array') +@click.option('--name', type=str, required=True) +@click.option('--index', type=str, required=True) +@click.option('--description', type=str, required=True) +@click.option('--value-url', type=str, required=False) +@click.option('--items-datatype', type=str, required=True) +@click.option('--min-items', type=int, required=False) +@click.option('--max-items', type=int, required=False) +@click.option('--unique-items', type=bool, required=False) +@click.argument('schema_file', type=click.Path(exists=True)) +@click.pass_context +def add_property_array(ctx, name, index, description, value_url, items_datatype, min_items, max_items, unique_items, schema_file): + """Add an Array property to an existing Schema. + """ + try: + datatype_enum = DatatypeEnum(items_datatype) + except Exception: + print(f"ITEMS Datatype {items_datatype} invalid\n" + + "ITEMS must be oneOf 'boolean'|'object'|'string'|'number'|'integer'" + ) + ctx.exit(code=1) + + try: + arrayPropertyModel = ArrayProperty( + datatype='array', + index=index, + description=description, + valueURL=value_url, + maxItems=max_items, + minItems=min_items, + uniqueItems=unique_items, + items=Items(datatype=datatype_enum) + ) + except ValidationError as metadataError: + print("ERROR: MetadataValidationError") + for validationFailure in metadataError.errors(): + click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") + ctx.exit(code=1) + + ClickAppendProperty(ctx, schema_file, arrayPropertyModel, name) + +def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5ValidationSchema]]: + """Determine which schema type to use based on file extension""" + ext = pathlib.Path(filepath).suffix.lower()[1:] + if ext in ('h5', 'hdf5'): + return HDF5ValidationSchema + elif ext in ('csv', 'tsv', 'parquet'): + return TabularValidationSchema + else: + raise ValueError(f"Unsupported file extension: {ext}") + +@schema.command('validate') +@click.option('--schema', type=str, required=True) +@click.option('--data', type=str, required=True) +@click.pass_context +def validate(ctx, schema, data): + """Execute validation of a Schema against the provided data.""" + if 'ark' not in schema: + schema_path = pathlib.Path(schema) + if not schema_path.exists(): + click.echo(f"ERROR: Schema file at path {schema} does not exist") + ctx.exit(1) + + data_path = pathlib.Path(data) + if not data_path.exists(): + click.echo(f"ERROR: Data file at path {data} does not exist") + ctx.exit(1) + + try: + with open(schema) as f: + schema_json = json.load(f) + + schema_class = determine_schema_type(data) + validation_schema = schema_class.model_validate(schema_json) + + validation_errors = validation_schema.validate_file(data) + + if len(validation_errors) != 0: + error_table = PrettyTable() + if isinstance(validation_schema, HDF5ValidationSchema): + error_table.field_names = ['path', 'error_type', 'failed_keyword', 'message'] + else: + 
error_table.field_names = ['row', 'error_type', 'failed_keyword', 'message'] + + for err in validation_errors: + if isinstance(validation_schema, HDF5ValidationSchema): + error_table.add_row([ + err.get("path"), + err.get("type"), + err.get("failed_keyword"), + str(err.get('message')) + ]) + else: + error_table.add_row([ + err.get("row"), + err.get("type"), + err.get("failed_keyword"), + str(err.get('message')) + ]) + + print(error_table) + ctx.exit(1) + else: + print('Validation Success') + ctx.exit(0) + + except ValidationError as metadata_error: + click.echo("Error with schema definition") + for validation_failure in metadata_error.errors(): + click.echo(f"property: {validation_failure.get('loc')} \tmsg: {validation_failure.get('msg')}") + ctx.exit(1) + except Exception as e: + click.echo(f"Error during validation: {str(e)}") + ctx.exit(1) + +@schema.command('infer') +@click.option('--name', required=True, type=str) +@click.option('--description', required=True, type=str) +@click.option('--guid', required=False, type=str, default="", show_default=False) +@click.argument('input_file', type=click.Path(exists=True)) +@click.argument('schema_file', type=str) +@click.pass_context +def infer_schema(ctx, name, description, guid, input_file, schema_file): + """Infer a schema from a file (CSV, TSV, Parquet, or HDF5).""" + try: + schema_class = determine_schema_type(input_file) + + schema_model = schema_class.infer_from_file( + input_file, + name, + description + ) + if guid: + schema_model.guid = guid + + WriteSchema(schema_model, schema_file) + + ext = pathlib.Path(input_file).suffix.lower()[1:] + click.echo(f"Inferred Schema from {ext} file: {str(schema_file)}") + + except ValueError as e: + click.echo(f"Error with file type: {str(e)}") + ctx.exit(code=1) + except Exception as e: + click.echo(f"Error inferring schema: {str(e)}") + ctx.exit(code=1) \ No newline at end of file From 86c6803613b92ee7466a105b49cc449609d985ed Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:16:42 -0500 Subject: [PATCH 07/14] latest --- .../models/schema/frictionless_tabular.py | 136 ++++++++++++------ .../schema/frictionless_schema.py | 32 ++--- 2 files changed, 107 insertions(+), 61 deletions(-) diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py index 6ead41b..c5afd46 100644 --- a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -158,6 +158,20 @@ def frictionless_type_to_json_schema(field_type: str) -> str: } return type_mapping.get(field_type, 'string') + +PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty] +def frictionless_type_from_property(prop: PropertyUnion) -> str: + """Convert PropertyUnion type to Frictionless field type""" + type_mapping = { + 'string': 'string', + 'integer': 'integer', + 'number': 'number', + 'boolean': 'boolean', + 'array': 'array', + 'null': 'string' # Default to string for null type + } + return type_mapping.get(prop.datatype, 'string') + class TabularValidationSchema(BaseModel): model_config = ConfigDict(populate_by_name=True) @@ -171,10 +185,10 @@ class TabularValidationSchema(BaseModel): separator: str = Field(description="Field separator for the file") header: bool = Field(description="Do files of this schema have a header row", default=True) required: List[str] = Field(default=[]) - properties: Dict[str, Dict] = Field(default={}) + properties: 
Dict[str, PropertyUnion] = Field(default={}) additionalProperties: bool = Field(default=True) - # Will store the frictionless schema + # Store the frictionless schema _frictionless_schema: Optional[Schema] = None def generate_guid(self) -> str: @@ -198,23 +212,28 @@ def infer_from_file(cls, filepath: str, name: str, description: str) -> 'Tabular separator = '\t' if file_type == FileType.TSV else ',' resource = describe(filepath) - properties = {} - required_fields = [] + required_fields = [] + + type_mapping = { + 'string': (StringProperty, 'string'), + 'integer': (IntegerProperty, 'integer'), + 'number': (NumberProperty, 'number'), + 'boolean': (BooleanProperty, 'boolean'), + 'array': (ArrayProperty, 'array'), + } for i, field in enumerate(resource.schema.fields): - json_schema_type = frictionless_type_to_json_schema(field.type) + property_class, datatype = type_mapping.get(field.type, (StringProperty, 'string')) - property_def = { - "type": json_schema_type, - "description": field.description or f"Column {field.name}", - "index": i - } - - properties[field.name] = property_def + property_def = property_class( + datatype=datatype, + description=field.description or f"Column {field.name}", + index=i + ) - if field.required: - required_fields.append(field.name) + properties[field.name] = property_def + required_fields.append(field.name) schema = cls( name=name, @@ -239,7 +258,6 @@ def validate_file(self, filepath: str) -> List[ValidationError]: ) report = resource.validate() - # Convert Frictionless errors to our format errors = [] for task in report.tasks: for error in task.errors: @@ -269,9 +287,9 @@ def from_dict(cls, data: dict) -> 'TabularValidationSchema': """Create a schema instance from a dictionary""" properties = data.pop('properties', {}) required_fields = data.pop('required', []) + frictionless_schema = Schema() - # Map JSON Schema types to frictionless field types type_to_field = { 'string': fields.StringField, 'integer': fields.IntegerField, @@ -288,6 +306,7 @@ def from_dict(cls, data: dict) -> 'TabularValidationSchema': constraints={} ) + # Add constraints if they exist if 'minimum' in prop: field.constraints['minimum'] = prop['minimum'] if 'maximum' in prop: @@ -318,18 +337,27 @@ def read_schema(schema_file: str) -> TabularValidationSchema: return TabularValidationSchema.from_dict(schema_dict) -def write_schema(schema: TabularValidationSchema, output_file: str): - """Write a schema to a file""" - schema_dict = schema.to_dict() - - with open(output_file, 'w') as f: - json.dump(schema_dict, f, indent=2) - class HDF5ValidationSchema(BaseModel): + guid: Optional[str] = Field(alias="@id", default=None) + context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") name: str description: str properties: Dict[str, TabularValidationSchema] = Field(default={}) required: List[str] = Field(default=[]) + + def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" + if self.guid is None: + prefix = f"schema-{self.name.lower().replace(' ', '-')}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" + return self.guid + + @model_validator(mode='after') + def generate_all_guids(self) -> 'HDF5ValidationSchema': + """Generate GUIDs for this schema and any nested schemas""" + self.generate_guid() + return self @staticmethod def dataset_to_dataframe(dataset: h5py.Dataset) -> pd.DataFrame: @@ -354,6 +382,28 @@ def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5Val ) properties = 
{} + def create_property(field_type: str, field_name: str, index: int, description: str) -> PropertyUnion: + """Helper function to create the correct property type instance""" + base_args = { + "description": description, + "index": index, + } + + if field_type == 'number': + return NumberProperty(datatype='number', **base_args) + elif field_type == 'integer': + return IntegerProperty(datatype='integer', **base_args) + elif field_type == 'boolean': + return BooleanProperty(datatype='boolean', **base_args) + elif field_type == 'array': + return ArrayProperty( + datatype='array', + items=Items(datatype='number'), + **base_args + ) + else: # default to string + return StringProperty(datatype='string', **base_args) + with h5py.File(filepath, 'r') as f: def process_group(group, parent_path=""): for key, item in group.items(): @@ -362,7 +412,6 @@ def process_group(group, parent_path=""): if isinstance(item, h5py.Dataset): try: df = cls.dataset_to_dataframe(item) - resource = describe(df) tabular_schema = TabularValidationSchema( @@ -371,23 +420,21 @@ def process_group(group, parent_path=""): separator=",", header=True, properties={}, - required=[] + required=[], + context=None ) tabular_schema._frictionless_schema = resource.schema for i, field in enumerate(resource.schema.fields): - property_def = { - "type": field.type, - "description": field.description or f"Column {field.name}", - "index": i - } - - if field.constraints: - for key, value in field.constraints.items(): - property_def[key] = value + property_instance = create_property( + field_type=field.type, + field_name=field.name, + index=i, + description=field.description or f"Column {field.name}" + ) - tabular_schema.properties[field.name] = property_def + tabular_schema.properties[field.name] = property_instance tabular_schema.required.append(field.name) properties[path] = tabular_schema @@ -452,16 +499,8 @@ def validate_file(self, filepath: str) -> List[ValidationError]: return errors def to_dict(self) -> dict: - """Convert the schema to a dictionary format""" - return { - "name": self.name, - "description": self.description, - "properties": { - path: schema.to_dict() - for path, schema in self.properties.items() - }, - "required": self.required - } + """Convert the schema to a dictionary format including all fields""" + return self.model_dump(by_alias=True) @classmethod def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': @@ -478,6 +517,13 @@ def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': required=data.get('required', []) ) +def write_schema(schema: TabularValidationSchema, output_file: str): + """Write a schema to a file""" + schema_dict = schema.to_dict() + + with open(output_file, 'w') as f: + json.dump(schema_dict, f, indent=2) + def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: # check that schemaFile exists schemaPath = pathlib.Path(schemaFilepath) diff --git a/src/fairscape_cli/schema/frictionless_schema.py b/src/fairscape_cli/schema/frictionless_schema.py index 1b315d4..a24a693 100644 --- a/src/fairscape_cli/schema/frictionless_schema.py +++ b/src/fairscape_cli/schema/frictionless_schema.py @@ -32,16 +32,16 @@ FAIRSCAPE_URI ) -@click.group('schema') -def schema(): +@click.group('frictionless') +def frictionless(): """Invoke operations on dataset schema. 
""" pass -@schema.command('create-tabular') +@frictionless.command('create-tabular') @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) -@click.option('--guid', required=False, type=str, default="", show_default=False) +@click.option('--guid', required=False, type=str, default=None, show_default=False) @click.option('--separator', type=str, required=True) @click.option('--header', required=False, type=bool, default=False) @click.argument('schema_file', type=str) @@ -77,7 +77,7 @@ def create_tabular_schema( WriteSchema(schema_model, schema_file) click.echo(f"Wrote Schema: {str(schema_file)}") -@schema.group('add-property') +@frictionless.group('add-property') def add_property(): """Add a Property to an existing schema. """ @@ -248,7 +248,7 @@ def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, else: raise ValueError(f"Unsupported file extension: {ext}") -@schema.command('validate') +@frictionless.command('validate') @click.option('--schema', type=str, required=True) @click.option('--data', type=str, required=True) @click.pass_context @@ -270,7 +270,7 @@ def validate(ctx, schema, data): schema_json = json.load(f) schema_class = determine_schema_type(data) - validation_schema = schema_class.model_validate(schema_json) + validation_schema = schema_class.from_dict(schema_json) validation_errors = validation_schema.validate_file(data) @@ -284,17 +284,17 @@ def validate(ctx, schema, data): for err in validation_errors: if isinstance(validation_schema, HDF5ValidationSchema): error_table.add_row([ - err.get("path"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.path, + err.type, + err.failed_keyword, + str(err.message) ]) else: error_table.add_row([ - err.get("row"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.row, + err.type, + err.failed_keyword, + str(err.message) ]) print(error_table) @@ -312,7 +312,7 @@ def validate(ctx, schema, data): click.echo(f"Error during validation: {str(e)}") ctx.exit(1) -@schema.command('infer') +@frictionless.command('infer') @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) @click.option('--guid', required=False, type=str, default="", show_default=False) From 922542af53c2e3e7babbe959f2ad940bfe367fc2 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:21:53 -0500 Subject: [PATCH 08/14] fix to not include all props if empty --- .../models/schema/frictionless_tabular.py | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py index c5afd46..f825301 100644 --- a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -185,7 +185,7 @@ class TabularValidationSchema(BaseModel): separator: str = Field(description="Field separator for the file") header: bool = Field(description="Do files of this schema have a header row", default=True) required: List[str] = Field(default=[]) - properties: Dict[str, PropertyUnion] = Field(default={}) + properties: Dict[str, Dict] = Field(default={}) additionalProperties: bool = Field(default=True) # Store the frictionless schema @@ -206,35 +206,29 @@ def generate_all_guids(self) -> 'TabularValidationSchema': return self @classmethod - def infer_from_file(cls, filepath: str, name: str, description: str) -> 
'TabularValidationSchema': + def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'TabularValidationSchema': """Infer schema from a file using Frictionless""" file_type = FileType.from_extension(filepath) separator = '\t' if file_type == FileType.TSV else ',' resource = describe(filepath) - properties = {} - required_fields = [] - type_mapping = { - 'string': (StringProperty, 'string'), - 'integer': (IntegerProperty, 'integer'), - 'number': (NumberProperty, 'number'), - 'boolean': (BooleanProperty, 'boolean'), - 'array': (ArrayProperty, 'array'), - } + properties = {} + required_fields = [] for i, field in enumerate(resource.schema.fields): - property_class, datatype = type_mapping.get(field.type, (StringProperty, 'string')) - - property_def = property_class( - datatype=datatype, - description=field.description or f"Column {field.name}", - index=i - ) + json_schema_type = frictionless_type_to_json_schema(field.type) + property_def = { + "type": json_schema_type, + "description": field.description or f"Column {field.name}", + "index": i + } + properties[field.name] = property_def - required_fields.append(field.name) + required_fields.append(field.name) + # Create our schema instance schema = cls( name=name, description=description, @@ -243,6 +237,8 @@ def infer_from_file(cls, filepath: str, name: str, description: str) -> 'Tabular properties=properties, required=required_fields ) + + # Store the frictionless schema for validation schema._frictionless_schema = resource.schema return schema From 267fcc2489a38f5119d57fdf7555c865568f59a5 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:27:20 -0500 Subject: [PATCH 09/14] same thing for hdf5 --- .../models/schema/frictionless_tabular.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py index f825301..b194b4b 100644 --- a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -423,14 +423,13 @@ def process_group(group, parent_path=""): tabular_schema._frictionless_schema = resource.schema for i, field in enumerate(resource.schema.fields): - property_instance = create_property( - field_type=field.type, - field_name=field.name, - index=i, - description=field.description or f"Column {field.name}" - ) + property_def = { + "type": field.type, + "description": field.description or f"Column {field.name}", + "index": i + } - tabular_schema.properties[field.name] = property_instance + tabular_schema.properties[field.name] = property_def tabular_schema.required.append(field.name) properties[path] = tabular_schema @@ -457,6 +456,8 @@ def validate_file(self, filepath: str) -> List[ValidationError]: dataset = f[path] if isinstance(dataset, h5py.Dataset): df = self.dataset_to_dataframe(dataset) + print(df) + print(schema._frictionless_schema) resource = Resource(data=df, schema=schema._frictionless_schema) report = resource.validate() From e201bd07f96aa05a76b674267b96b2478a6c603f Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:28:54 -0500 Subject: [PATCH 10/14] more changes --- .../models/schema/frictionless_tabular.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py index b194b4b..181b5bd 100644 --- 
a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -377,28 +377,6 @@ def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5Val description=description ) properties = {} - - def create_property(field_type: str, field_name: str, index: int, description: str) -> PropertyUnion: - """Helper function to create the correct property type instance""" - base_args = { - "description": description, - "index": index, - } - - if field_type == 'number': - return NumberProperty(datatype='number', **base_args) - elif field_type == 'integer': - return IntegerProperty(datatype='integer', **base_args) - elif field_type == 'boolean': - return BooleanProperty(datatype='boolean', **base_args) - elif field_type == 'array': - return ArrayProperty( - datatype='array', - items=Items(datatype='number'), - **base_args - ) - else: # default to string - return StringProperty(datatype='string', **base_args) with h5py.File(filepath, 'r') as f: def process_group(group, parent_path=""): From 8344ba65ea141e12cf8d7425468d5e5662c7030d Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:34:30 -0500 Subject: [PATCH 11/14] append drop nones --- .../models/schema/frictionless_tabular.py | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py index 181b5bd..beb7fdf 100644 --- a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ b/src/fairscape_cli/models/schema/frictionless_tabular.py @@ -499,43 +499,28 @@ def write_schema(schema: TabularValidationSchema, output_file: str): with open(output_file, 'w') as f: json.dump(schema_dict, f, indent=2) -def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: +def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: # check that schemaFile exists schemaPath = pathlib.Path(schemaFilepath) - if not schemaPath.exists(): raise Exception with schemaPath.open("r+") as schemaFile: schemaFileContents = schemaFile.read() - schemaJson = json.loads(schemaFileContents) + schemaJson = json.loads(schemaFileContents) - # load the model into a tabular validation schema schemaModel = TabularValidationSchema.model_validate(schemaJson) - # TODO check for inconsitencies - - # does there exist a property with same name if propertyName in [key for key in schemaModel.properties.keys()]: raise PropertyNameException(propertyName) - # does there exist a property with same column number - schema_indicies = [ val.index for val in schemaModel.properties.values()] - - # check overlap of indicies - # CheckOverlap - - - # add new property to schema + schema_indicies = [val['index'] for val in schemaModel.properties.values()] + schemaModel.properties[propertyName] = propertyInstance - - # add new property as required schemaModel.required.append(propertyName) + schemaJson = json.dumps(schemaModel.model_dump(by_alias=True, exclude_none=True), indent=2) - # serialize model to json - schemaJson = json.dumps(schemaModel.model_dump(by_alias=True) , indent=2) - - # overwrite file contents + # overwrite file contents schemaFile.seek(0) schemaFile.write(schemaJson) From 01fd75916ebe4c870a2bae55d3bb1facf98fe7dc Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 14:37:05 -0500 Subject: [PATCH 12/14] switch to frictionless --- .../models/schema/frictionless_tabular.py | 562 ---------------- 
src/fairscape_cli/models/schema/tabular.py | 608 +++++++++--------- .../schema/frictionless_schema.py | 345 ---------- src/fairscape_cli/schema/schema.py | 74 +-- 4 files changed, 314 insertions(+), 1275 deletions(-) delete mode 100644 src/fairscape_cli/models/schema/frictionless_tabular.py delete mode 100644 src/fairscape_cli/schema/frictionless_schema.py diff --git a/src/fairscape_cli/models/schema/frictionless_tabular.py b/src/fairscape_cli/models/schema/frictionless_tabular.py deleted file mode 100644 index beb7fdf..0000000 --- a/src/fairscape_cli/models/schema/frictionless_tabular.py +++ /dev/null @@ -1,562 +0,0 @@ -import pathlib -import os -import json -import pandas as pd -import h5py -from datetime import datetime -from enum import Enum -from pydantic import ( - BaseModel, - ConfigDict, - Field, - ValidationError, - model_validator -) -from typing import ( - Dict, - List, - Optional, - Literal, - Union -) -from frictionless import Schema, Resource, describe, fields - - -from fairscape_cli.models.schema.utils import ( - PropertyNameException, - ColumnIndexException, -) - -from fairscape_cli.config import ( - DEFAULT_CONTEXT, - DEFAULT_SCHEMA_TYPE, - NAAN, -) - -class FileType(str, Enum): - CSV = "csv" - TSV = "tsv" - PARQUET = "parquet" - - @classmethod - def from_extension(cls, filepath: str) -> 'FileType': - ext = pathlib.Path(filepath).suffix.lower()[1:] - if ext == 'parquet': - return cls.PARQUET - elif ext == 'tsv': - return cls.TSV - elif ext == 'csv': - return cls.CSV - else: - raise ValueError(f"Unsupported file extension: {ext}") - -class ValidationError(BaseModel): - message: str - row: Optional[int] = None - field: Optional[str] = None - type: str = "ValidationError" - failed_keyword: str - path: Optional[str] = None - -class DatatypeEnum(str, Enum): - NULL = "null" - BOOLEAN = "boolean" - STRING = "string" - NUMBER = "number" - INTEGER = "integer" - ARRAY = "array" - -class Items(BaseModel): - model_config = ConfigDict( - populate_by_name = True, - use_enum_values=True - ) - datatype: DatatypeEnum = Field(alias="type") - -class BaseProperty(BaseModel): - description: str = Field(description="description of field") - model_config = ConfigDict(populate_by_name = True) - index: Union[int,str] = Field(description="index of the column for this value") - valueURL: Optional[str] = Field(default=None) - -class NullProperty(BaseProperty): - datatype: Literal['null'] = Field(alias="type", default='null') - index: int - -class StringProperty(BaseProperty): - datatype: Literal['string'] = Field(alias="type") - pattern: Optional[str] = Field(description="Regex pattern to execute against values", default=None) - maxLength: Optional[int] = Field(description="Inclusive maximum length for string values", default=None) - minLength: Optional[int] = Field(description="Inclusive minimum length for string values", default=None) - index: int - -class ArrayProperty(BaseProperty): - datatype: Literal['array'] = Field(alias="type") - maxItems: Optional[int] = Field(description="max items in array, validation fails if length is greater than this value", default=None) - minItems: Optional[int] = Field(description="min items in array, validation fails if lenght is shorter than this value", default=None) - uniqueItems: Optional[bool] = Field() - index: str - items: Items - -class BooleanProperty(BaseProperty): - datatype: Literal['boolean'] = Field(alias="type") - index: int - -class NumberProperty(BaseProperty): - datatype: Literal['number'] = Field(alias="type") - maximum: Optional[float] = 
Field(description="Inclusive Upper Limit for Values", default=None) - minimum: Optional[float] = Field(description="Inclusive Lower Limit for Values", default=None) - index: int - - @model_validator(mode='after') - def check_max_min(self) -> 'NumberProperty': - minimum = self.minimum - maximum = self.maximum - - if maximum is not None and minimum is not None: - if maximum == minimum: - raise ValueError('NumberProperty attribute minimum != maximum') - elif maximum < minimum: - raise ValueError('NumberProperty attribute maximum !< minimum') - return self - -class IntegerProperty(BaseProperty): - datatype: Literal['integer'] = Field(alias="type") - maximum: Optional[int] = Field(description="Inclusive Upper Limit for Values", default=None) - minimum: Optional[int] = Field(description="Inclusive Lower Limit for Values", default=None) - index: int - - @model_validator(mode='after') - def check_max_min(self) -> 'IntegerProperty': - minimum = self.minimum - maximum = self.maximum - - if maximum is not None and minimum is not None: - if maximum == minimum: - raise ValueError('IntegerProperty attribute minimum != maximum') - elif maximum < minimum: - raise ValueError('IntegerProperty attribute maximum !< minimum') - return self - -def frictionless_type_to_json_schema(field_type: str) -> str: - """Convert Frictionless types to JSON Schema types""" - type_mapping = { - 'string': 'string', - 'integer': 'integer', - 'number': 'number', - 'boolean': 'boolean', - 'date': 'string', - 'datetime': 'string', - 'year': 'integer', - 'yearmonth': 'string', - 'duration': 'string', - 'geopoint': 'array', - 'geojson': 'object', - 'array': 'array', - 'object': 'object', - 'time': 'string' - } - return type_mapping.get(field_type, 'string') - - -PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty] -def frictionless_type_from_property(prop: PropertyUnion) -> str: - """Convert PropertyUnion type to Frictionless field type""" - type_mapping = { - 'string': 'string', - 'integer': 'integer', - 'number': 'number', - 'boolean': 'boolean', - 'array': 'array', - 'null': 'string' # Default to string for null type - } - return type_mapping.get(prop.datatype, 'string') - -class TabularValidationSchema(BaseModel): - model_config = ConfigDict(populate_by_name=True) - - guid: Optional[str] = Field(alias="@id", default=None) - context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") - metadataType: Optional[str] = Field(default=DEFAULT_SCHEMA_TYPE, alias="@type") - schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="$schema") - name: str - description: str - datatype: str = Field(default="object", alias="type") - separator: str = Field(description="Field separator for the file") - header: bool = Field(description="Do files of this schema have a header row", default=True) - required: List[str] = Field(default=[]) - properties: Dict[str, Dict] = Field(default={}) - additionalProperties: bool = Field(default=True) - - # Store the frictionless schema - _frictionless_schema: Optional[Schema] = None - - def generate_guid(self) -> str: - """Generate a unique identifier for the schema""" - if self.guid is None: - prefix = f"schema-{self.name.lower().replace(' ', '-')}" - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" - return self.guid - - @model_validator(mode='after') - def generate_all_guids(self) -> 'TabularValidationSchema': - """Generate GUIDs for this schema 
and any nested schemas""" - self.generate_guid() - return self - - @classmethod - def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'TabularValidationSchema': - """Infer schema from a file using Frictionless""" - file_type = FileType.from_extension(filepath) - separator = '\t' if file_type == FileType.TSV else ',' - - resource = describe(filepath) - - properties = {} - required_fields = [] - - for i, field in enumerate(resource.schema.fields): - json_schema_type = frictionless_type_to_json_schema(field.type) - - property_def = { - "type": json_schema_type, - "description": field.description or f"Column {field.name}", - "index": i - } - - properties[field.name] = property_def - required_fields.append(field.name) - - # Create our schema instance - schema = cls( - name=name, - description=description, - separator=separator, - header=True, - properties=properties, - required=required_fields - ) - - # Store the frictionless schema for validation - schema._frictionless_schema = resource.schema - return schema - - def validate_file(self, filepath: str) -> List[ValidationError]: - """Validate a file against the schema using Frictionless""" - if not self._frictionless_schema: - raise ValueError("Schema not properly initialized") - - resource = Resource( - path=os.path.basename(filepath), - basepath=os.path.dirname(filepath), - schema=self._frictionless_schema - ) - report = resource.validate() - - errors = [] - for task in report.tasks: - for error in task.errors: - if isinstance(error, TypeError): - validation_error = ValidationError( - message=str(error), - type="ValidationError", - failed_keyword="type" - ) - else: - validation_error = ValidationError( - message=error.message, - row=error.row_number if hasattr(error, 'row_number') else None, - field=error.field_name if hasattr(error, 'field_name') else None, - failed_keyword=error.code if hasattr(error, 'code') else "error" - ) - errors.append(validation_error) - - return errors - - def to_dict(self) -> dict: - """Convert the schema to a dictionary format""" - return self.model_dump(by_alias=True, exclude={'_frictionless_schema'}) - - @classmethod - def from_dict(cls, data: dict) -> 'TabularValidationSchema': - """Create a schema instance from a dictionary""" - properties = data.pop('properties', {}) - required_fields = data.pop('required', []) - - frictionless_schema = Schema() - - type_to_field = { - 'string': fields.StringField, - 'integer': fields.IntegerField, - 'number': fields.NumberField, - 'boolean': fields.BooleanField, - 'array': fields.ArrayField - } - - for name, prop in properties.items(): - field_type = type_to_field.get(prop.get('type', 'string'), fields.StringField) - field = field_type( - name=name, - description=prop.get('description', ''), - constraints={} - ) - - # Add constraints if they exist - if 'minimum' in prop: - field.constraints['minimum'] = prop['minimum'] - if 'maximum' in prop: - field.constraints['maximum'] = prop['maximum'] - if 'pattern' in prop: - field.constraints['pattern'] = prop['pattern'] - if 'minLength' in prop: - field.constraints['minLength'] = prop['minLength'] - if 'maxLength' in prop: - field.constraints['maxLength'] = prop['maxLength'] - - frictionless_schema.add_field(field) - - # Create our schema instance - schema = cls(**data, properties=properties, required=required_fields) - schema._frictionless_schema = frictionless_schema - return schema - -def read_schema(schema_file: str) -> TabularValidationSchema: - """Read a schema from a file""" - 
schema_path = pathlib.Path(schema_file) - - if not schema_path.exists(): - raise FileNotFoundError(f"Schema file not found: {schema_file}") - - with schema_path.open('r') as f: - schema_dict = json.load(f) - - return TabularValidationSchema.from_dict(schema_dict) - -class HDF5ValidationSchema(BaseModel): - guid: Optional[str] = Field(alias="@id", default=None) - context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") - name: str - description: str - properties: Dict[str, TabularValidationSchema] = Field(default={}) - required: List[str] = Field(default=[]) - - def generate_guid(self) -> str: - """Generate a unique identifier for the schema""" - if self.guid is None: - prefix = f"schema-{self.name.lower().replace(' ', '-')}" - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" - return self.guid - - @model_validator(mode='after') - def generate_all_guids(self) -> 'HDF5ValidationSchema': - """Generate GUIDs for this schema and any nested schemas""" - self.generate_guid() - return self - - @staticmethod - def dataset_to_dataframe(dataset: h5py.Dataset) -> pd.DataFrame: - """Convert an HDF5 dataset to a pandas DataFrame""" - data = dataset[()] - - if dataset.dtype.fields: # Structured array - return pd.DataFrame(data) - elif len(dataset.shape) > 1: # Multi-dimensional array - n_cols = dataset.shape[1] - columns = [f"column_{i}" for i in range(n_cols)] - return pd.DataFrame(data, columns=columns) - else: # 1D array - return pd.DataFrame(data, columns=['value']) - - @classmethod - def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5ValidationSchema': - """Infer schema from an HDF5 file""" - schema = cls( - name=name, - description=description - ) - properties = {} - - with h5py.File(filepath, 'r') as f: - def process_group(group, parent_path=""): - for key, item in group.items(): - path = f"{parent_path}/{key}" if parent_path else key - - if isinstance(item, h5py.Dataset): - try: - df = cls.dataset_to_dataframe(item) - resource = describe(df) - - tabular_schema = TabularValidationSchema( - name=f"{name}_{path.replace('/', '_')}", - description=f"Dataset at {path}", - separator=",", - header=True, - properties={}, - required=[], - context=None - ) - - tabular_schema._frictionless_schema = resource.schema - - for i, field in enumerate(resource.schema.fields): - property_def = { - "type": field.type, - "description": field.description or f"Column {field.name}", - "index": i - } - - tabular_schema.properties[field.name] = property_def - tabular_schema.required.append(field.name) - - properties[path] = tabular_schema - - except Exception as e: - print(f"Warning: Could not process dataset {path}: {str(e)}") - - elif isinstance(item, h5py.Group): - process_group(item, path) - - process_group(f) - schema.properties = properties - schema.required = list(properties.keys()) - - return schema - - def validate_file(self, filepath: str) -> List[ValidationError]: - """Validate an HDF5 file against the schema""" - errors = [] - - with h5py.File(filepath, 'r') as f: - for path, schema in self.properties.items(): - try: - dataset = f[path] - if isinstance(dataset, h5py.Dataset): - df = self.dataset_to_dataframe(dataset) - print(df) - print(schema._frictionless_schema) - resource = Resource(data=df, schema=schema._frictionless_schema) - report = resource.validate() - - for task in report.tasks: - for error in task.errors: - # Skip string type errors - if (hasattr(error, 'type') and error.type == 'type-error' and - 
hasattr(error, 'note') and 'type is "string' in error.note): - continue - - validation_error = ValidationError( - message=error.message, - row=error.rowNumber if hasattr(error, 'rowNumber') else None, - field=error.fieldName if hasattr(error, 'fieldName') else None, - type="ValidationError", - failed_keyword=error.type if hasattr(error, 'type') else "error", - path=path - ) - errors.append(validation_error) - - except KeyError: - errors.append(ValidationError( - message=f"Dataset {path} not found", - type="ValidationError", - failed_keyword="required", - path=path - )) - except Exception as e: - errors.append(ValidationError( - message=f"Error validating dataset {path}: {str(e)}", - type="ValidationError", - failed_keyword="format", - path=path - )) - - return errors - - def to_dict(self) -> dict: - """Convert the schema to a dictionary format including all fields""" - return self.model_dump(by_alias=True) - - @classmethod - def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': - """Create a schema instance from a dictionary""" - properties = { - path: TabularValidationSchema.from_dict(schema_dict) - for path, schema_dict in data.get('properties', {}).items() - } - - return cls( - name=data['name'], - description=data['description'], - properties=properties, - required=data.get('required', []) - ) - -def write_schema(schema: TabularValidationSchema, output_file: str): - """Write a schema to a file""" - schema_dict = schema.to_dict() - - with open(output_file, 'w') as f: - json.dump(schema_dict, f, indent=2) - -def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: - # check that schemaFile exists - schemaPath = pathlib.Path(schemaFilepath) - if not schemaPath.exists(): - raise Exception - - with schemaPath.open("r+") as schemaFile: - schemaFileContents = schemaFile.read() - schemaJson = json.loads(schemaFileContents) - - schemaModel = TabularValidationSchema.model_validate(schemaJson) - - if propertyName in [key for key in schemaModel.properties.keys()]: - raise PropertyNameException(propertyName) - - schema_indicies = [val['index'] for val in schemaModel.properties.values()] - - schemaModel.properties[propertyName] = propertyInstance - schemaModel.required.append(propertyName) - schemaJson = json.dumps(schemaModel.model_dump(by_alias=True, exclude_none=True), indent=2) - - # overwrite file contents - schemaFile.seek(0) - schemaFile.write(schemaJson) - -def ClickAppendProperty(ctx, schemaFile, propertyModel, name): - try: - # append the property to the - AppendProperty(schemaFile, propertyModel, name) - print(f"Added Property\tname: {name}\ttype: {propertyModel.datatype}") - ctx.exit(code=0) - - except ColumnIndexException as indexException: - print("ERROR: ColumnIndexError") - print(str(indexException)) - ctx.exit(code=1) - except PropertyNameException as propertyException: - print("ERROR: PropertyNameError") - print(str(propertyException)) - ctx.exit(code=1) - -def ReadSchemaGithub(schemaURI: str) -> TabularValidationSchema: - pass - -def ReadSchemaFairscape(schemaArk: str) -> TabularValidationSchema: - pass - -def ReadSchemaLocal(schemaFile: str) -> TabularValidationSchema: - """ Helper function for reading the schema and marshaling into the pydantic model - """ - schemaPath = pathlib.Path(schemaFile) - - # read the schema - with schemaPath.open("r") as inputSchema: - inputSchemaData = inputSchema.read() - schemaJson = json.loads(inputSchemaData) - - # load the model into - tabularSchema = TabularValidationSchema.model_validate(schemaJson) - return 
tabularSchema - diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index 205c372..1b6df4a 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -1,17 +1,13 @@ -import jsonschema import pathlib -from functools import lru_cache import os import json import pandas as pd -import pyarrow.parquet as pq -import pyarrow.compute as pc import h5py +from datetime import datetime from enum import Enum from pydantic import ( BaseModel, ConfigDict, - computed_field, Field, ValidationError, model_validator @@ -19,21 +15,16 @@ from typing import ( Dict, List, - Optional, - Union, + Optional, Literal, - Type + Union ) +from frictionless import Schema, Resource, describe, fields + from fairscape_cli.models.schema.utils import ( - GenerateSlice, PropertyNameException, ColumnIndexException, - map_arrow_type_to_json_schema -) - -from fairscape_cli.models.guid_utils import ( - GenerateDatetimeSquid ) from fairscape_cli.config import ( @@ -46,14 +37,11 @@ class FileType(str, Enum): CSV = "csv" TSV = "tsv" PARQUET = "parquet" - HDF5 = "h5" @classmethod def from_extension(cls, filepath: str) -> 'FileType': - ext = pathlib.Path(filepath).suffix.lower()[1:] # Remove the dot - if ext == 'h5' or ext == 'hdf5': - return cls.HDF5 - elif ext == 'parquet': + ext = pathlib.Path(filepath).suffix.lower()[1:] + if ext == 'parquet': return cls.PARQUET elif ext == 'tsv': return cls.TSV @@ -62,6 +50,14 @@ def from_extension(cls, filepath: str) -> 'FileType': else: raise ValueError(f"Unsupported file extension: {ext}") +class ValidationError(BaseModel): + message: str + row: Optional[int] = None + field: Optional[str] = None + type: str = "ValidationError" + failed_keyword: str + path: Optional[str] = None + class DatatypeEnum(str, Enum): NULL = "null" BOOLEAN = "boolean" @@ -142,250 +138,246 @@ def check_max_min(self) -> 'IntegerProperty': raise ValueError('IntegerProperty attribute maximum !< minimum') return self -class BaseSchema(BaseModel): +def frictionless_type_to_json_schema(field_type: str) -> str: + """Convert Frictionless types to JSON Schema types""" + type_mapping = { + 'string': 'string', + 'integer': 'integer', + 'number': 'number', + 'boolean': 'boolean', + 'date': 'string', + 'datetime': 'string', + 'year': 'integer', + 'yearmonth': 'string', + 'duration': 'string', + 'geopoint': 'array', + 'geojson': 'object', + 'array': 'array', + 'object': 'object', + 'time': 'string' + } + return type_mapping.get(field_type, 'string') + + +PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty] +def frictionless_type_from_property(prop: PropertyUnion) -> str: + """Convert PropertyUnion type to Frictionless field type""" + type_mapping = { + 'string': 'string', + 'integer': 'integer', + 'number': 'number', + 'boolean': 'boolean', + 'array': 'array', + 'null': 'string' # Default to string for null type + } + return type_mapping.get(prop.datatype, 'string') + +class TabularValidationSchema(BaseModel): + model_config = ConfigDict(populate_by_name=True) + guid: Optional[str] = Field(alias="@id", default=None) context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") metadataType: Optional[str] = Field(default=DEFAULT_SCHEMA_TYPE, alias="@type") - schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="schema") + schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="$schema") name: 
str description: str datatype: str = Field(default="object", alias="type") + separator: str = Field(description="Field separator for the file") + header: bool = Field(description="Do files of this schema have a header row", default=True) + required: List[str] = Field(default=[]) + properties: Dict[str, Dict] = Field(default={}) additionalProperties: bool = Field(default=True) - required: List[str] = Field(description="list of required properties by name", default=[]) - examples: Optional[List[Dict[str, str]]] = Field(default=[]) + + # Store the frictionless schema + _frictionless_schema: Optional[Schema] = None def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" if self.guid is None: prefix = f"schema-{self.name.lower().replace(' ', '-')}" - sq = GenerateDatetimeSquid() - self.guid = f"ark:{NAAN}/{prefix}-{sq}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" return self.guid - + @model_validator(mode='after') - def generate_all_guids(self) -> 'BaseSchema': + def generate_all_guids(self) -> 'TabularValidationSchema': """Generate GUIDs for this schema and any nested schemas""" self.generate_guid() - - # Generate GUIDs for any nested schemas in properties - if hasattr(self, 'properties'): - for prop in self.properties.values(): - if isinstance(prop, BaseSchema): - prop.generate_guid() - return self - - def to_json_schema(self) -> dict: - """Convert the HDF5Schema to JSON Schema format""" - schema = self.model_dump( - by_alias=True, - exclude_unset=True, - exclude_none=True - ) - return schema - -PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty] -class TabularValidationSchema(BaseSchema): - properties: Dict[str, PropertyUnion] = Field(default={}) - separator: str = Field(description="Field separator for the file") - header: bool = Field(description="Do files of this schema have a header row", default=False) @classmethod def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'TabularValidationSchema': - """Infer schema from a file""" + """Infer schema from a file using Frictionless""" file_type = FileType.from_extension(filepath) + separator = '\t' if file_type == FileType.TSV else ',' - if file_type == FileType.PARQUET: - return cls.infer_from_parquet(name, description, None, filepath, include_min_max) - else: # csv or tsv - separator = '\t' if file_type == FileType.TSV else ',' - df = pd.read_csv(filepath, sep=separator) - return cls.infer_from_dataframe(df, name, description, include_min_max, separator) - - @classmethod - def infer_from_dataframe(cls, df: pd.DataFrame, name: str, description: str, include_min_max: bool = False, separator: str = ',') -> 'TabularValidationSchema': - """Infer schema from a pandas DataFrame""" - type_map = { - 'int16': ('integer', IntegerProperty, int), - 'int32': ('integer', IntegerProperty, int), - 'int64': ('integer', IntegerProperty, int), - 'uint8': ('integer', IntegerProperty, int), - 'uint16': ('integer', IntegerProperty, int), - 'uint32': ('integer', IntegerProperty, int), - 'uint64': ('integer', IntegerProperty, int), - 'float16': ('number', NumberProperty, float), - 'float32': ('number', NumberProperty, float), - 'float64': ('number', NumberProperty, float), - 'bool': ('boolean', BooleanProperty, None), - } + resource = describe(filepath) properties = {} - for i, (column_name, dtype) in enumerate(df.dtypes.items()): - dtype_str = str(dtype) - datatype, 
property_class, converter = type_map.get(dtype_str, ('string', StringProperty, None)) + required_fields = [] + + for i, field in enumerate(resource.schema.fields): + json_schema_type = frictionless_type_to_json_schema(field.type) - kwargs = { - "datatype": datatype, - "description": f"Column {column_name}", + property_def = { + "type": json_schema_type, + "description": field.description or f"Column {field.name}", "index": i } - - if include_min_max and converter: - kwargs.update({ - "minimum": converter(df[column_name].min()), - "maximum": converter(df[column_name].max()) - }) - - properties[column_name] = property_class(**kwargs) + + properties[field.name] = property_def + required_fields.append(field.name) - return cls( + # Create our schema instance + schema = cls( name=name, description=description, - properties=properties, - required=list(properties.keys()), separator=separator, - header=True + header=True, + properties=properties, + required=required_fields ) + + # Store the frictionless schema for validation + schema._frictionless_schema = resource.schema + return schema - @classmethod - def infer_from_parquet(cls, name: str, description: str, guid: Optional[str], filepath: str, include_min_max: bool = False) -> 'TabularValidationSchema': - """Infer schema from a Parquet file""" - table = pq.read_table(filepath) - schema = table.schema - properties = {} - - for i, field in enumerate(schema): - field_name = field.name - field_type = map_arrow_type_to_json_schema(field.type) - - if field_type == 'string': - properties[field_name] = StringProperty( - datatype='string', - description=f"Column {field_name}", - index=i - ) - elif field_type == 'integer': - if include_min_max: - column = table.column(field_name) - min_max = pc.min_max(column) - properties[field_name] = IntegerProperty( - datatype='integer', - description=f"Column {field_name}", - index=i, - minimum=min_max['min'].as_py(), - maximum=min_max['max'].as_py() - ) - else: - properties[field_name] = IntegerProperty( - datatype='integer', - description=f"Column {field_name}", - index=i - ) - elif field_type == 'number': - if include_min_max: - column = table.column(field_name) - min_max = pc.min_max(column) - properties[field_name] = NumberProperty( - datatype='number', - description=f"Column {field_name}", - index=i, - minimum=min_max['min'].as_py(), - maximum=min_max['max'].as_py() + def validate_file(self, filepath: str) -> List[ValidationError]: + """Validate a file against the schema using Frictionless""" + if not self._frictionless_schema: + raise ValueError("Schema not properly initialized") + + resource = Resource( + path=os.path.basename(filepath), + basepath=os.path.dirname(filepath), + schema=self._frictionless_schema + ) + report = resource.validate() + + errors = [] + for task in report.tasks: + for error in task.errors: + if isinstance(error, TypeError): + validation_error = ValidationError( + message=str(error), + type="ValidationError", + failed_keyword="type" ) else: - properties[field_name] = NumberProperty( - datatype='number', - description=f"Column {field_name}", - index=i + validation_error = ValidationError( + message=error.message, + row=error.row_number if hasattr(error, 'row_number') else None, + field=error.field_name if hasattr(error, 'field_name') else None, + failed_keyword=error.code if hasattr(error, 'code') else "error" ) - elif field_type == 'boolean': - properties[field_name] = BooleanProperty( - datatype='boolean', - description=f"Column {field_name}", - index=i - ) + 
errors.append(validation_error) + + return errors - return cls( - name=name, - description=description, - guid=guid, - properties=properties, - required=list(properties.keys()), - separator=",", # Not used for parquet but required - header=True # Not used for parquet but required - ) + def to_dict(self) -> dict: + """Convert the schema to a dictionary format""" + return self.model_dump(by_alias=True, exclude={'_frictionless_schema'}) - def validate_file(self, filepath: str) -> List[Dict]: - """Validate a file against the schema""" - file_type = FileType.from_extension(filepath) + @classmethod + def from_dict(cls, data: dict) -> 'TabularValidationSchema': + """Create a schema instance from a dictionary""" + properties = data.pop('properties', {}) + required_fields = data.pop('required', []) - if file_type == FileType.PARQUET: - df = pd.read_parquet(filepath) - else: # csv or tsv - sep = '\t' if file_type == FileType.TSV else self.separator - df = pd.read_csv(filepath, sep=sep, header=0 if self.header else None) + frictionless_schema = Schema() - return self.validate_dataframe(df) - - def validate_dataframe(self, df: pd.DataFrame) -> List[Dict]: - """Validate a dataframe against the schema with lenient string type checking. - Only reports string validation errors for pattern mismatches, not type mismatches.""" - json_schema = self.to_json_schema() - validator = jsonschema.Draft202012Validator(json_schema) - errors = [] - - for i, row in df.iterrows(): - row_dict = row.to_dict() - validation_errors = sorted(validator.iter_errors(row_dict), key=lambda e: e.path) + type_to_field = { + 'string': fields.StringField, + 'integer': fields.IntegerField, + 'number': fields.NumberField, + 'boolean': fields.BooleanField, + 'array': fields.ArrayField + } + + for name, prop in properties.items(): + field_type = type_to_field.get(prop.get('type', 'string'), fields.StringField) + field = field_type( + name=name, + description=prop.get('description', ''), + constraints={} + ) - for err in validation_errors: - # Skip type validation errors for string fields unless there's a pattern mismatch - if err.validator == "type": - field_name = list(err.path)[-1] if err.path else None - if field_name in self.properties: - prop = self.properties[field_name] - if prop.datatype == "string": - # Skip string type validation errors - continue + # Add constraints if they exist + if 'minimum' in prop: + field.constraints['minimum'] = prop['minimum'] + if 'maximum' in prop: + field.constraints['maximum'] = prop['maximum'] + if 'pattern' in prop: + field.constraints['pattern'] = prop['pattern'] + if 'minLength' in prop: + field.constraints['minLength'] = prop['minLength'] + if 'maxLength' in prop: + field.constraints['maxLength'] = prop['maxLength'] - # Include all other validation errors - errors.append({ - "message": err.message, - "row": i, - "field": list(err.path)[-1] if err.path else None, - "type": "ValidationError", - "failed_keyword": err.validator - }) - - return errors + frictionless_schema.add_field(field) + + # Create our schema instance + schema = cls(**data, properties=properties, required=required_fields) + schema._frictionless_schema = frictionless_schema + return schema + +def read_schema(schema_file: str) -> TabularValidationSchema: + """Read a schema from a file""" + schema_path = pathlib.Path(schema_file) + + if not schema_path.exists(): + raise FileNotFoundError(f"Schema file not found: {schema_file}") + + with schema_path.open('r') as f: + schema_dict = json.load(f) + + return 
TabularValidationSchema.from_dict(schema_dict) -class HDF5Schema(BaseSchema): +class HDF5ValidationSchema(BaseModel): + guid: Optional[str] = Field(alias="@id", default=None) + context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") + name: str + description: str properties: Dict[str, TabularValidationSchema] = Field(default={}) + required: List[str] = Field(default=[]) + def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" + if self.guid is None: + prefix = f"schema-{self.name.lower().replace(' ', '-')}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" + return self.guid + + @model_validator(mode='after') + def generate_all_guids(self) -> 'HDF5ValidationSchema': + """Generate GUIDs for this schema and any nested schemas""" + self.generate_guid() + return self + @staticmethod def dataset_to_dataframe(dataset: h5py.Dataset) -> pd.DataFrame: - """Convert any HDF5 dataset to a pandas DataFrame""" + """Convert an HDF5 dataset to a pandas DataFrame""" data = dataset[()] - # structured array convert directly - if dataset.dtype.fields: + if dataset.dtype.fields: # Structured array return pd.DataFrame(data) - - # For multi-dimensional arrays make up column name - elif len(dataset.shape) > 1: - n_cols = dataset.shape[1] if len(dataset.shape) > 1 else 1 + elif len(dataset.shape) > 1: # Multi-dimensional array + n_cols = dataset.shape[1] columns = [f"column_{i}" for i in range(n_cols)] return pd.DataFrame(data, columns=columns) - - # For 1D arrays convert to single column DataFrame - else: + else: # 1D array return pd.DataFrame(data, columns=['value']) - @classmethod - def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'HDF5Schema': - """Infer schema from HDF5 file""" - schema = cls(name=name, description=description) + @classmethod + def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5ValidationSchema': + """Infer schema from an HDF5 file""" + schema = cls( + name=name, + description=description + ) properties = {} - + with h5py.File(filepath, 'r') as f: def process_group(group, parent_path=""): for key, item in group.items(): @@ -394,98 +386,139 @@ def process_group(group, parent_path=""): if isinstance(item, h5py.Dataset): try: df = cls.dataset_to_dataframe(item) - properties[path] = TabularValidationSchema.infer_from_dataframe( - df, + resource = describe(df) + + tabular_schema = TabularValidationSchema( name=f"{name}_{path.replace('/', '_')}", description=f"Dataset at {path}", - include_min_max=include_min_max + separator=",", + header=True, + properties={}, + required=[], + context=None ) + + tabular_schema._frictionless_schema = resource.schema + + for i, field in enumerate(resource.schema.fields): + property_def = { + "type": field.type, + "description": field.description or f"Column {field.name}", + "index": i + } + + tabular_schema.properties[field.name] = property_def + tabular_schema.required.append(field.name) + + properties[path] = tabular_schema + except Exception as e: - print(f"Warning: Could not convert dataset {path} to DataFrame: {str(e)}") + print(f"Warning: Could not process dataset {path}: {str(e)}") elif isinstance(item, h5py.Group): - # Recursively process group contents process_group(item, path) - + process_group(f) schema.properties = properties schema.required = list(properties.keys()) return schema - def validate_file(self, filepath: str) -> List[Dict]: + def validate_file(self, filepath: str) 
-> List[ValidationError]: """Validate an HDF5 file against the schema""" errors = [] with h5py.File(filepath, 'r') as f: for path, schema in self.properties.items(): try: - # Try to get the dataset using the path dataset = f[path] if isinstance(dataset, h5py.Dataset): - # Convert dataset to DataFrame df = self.dataset_to_dataframe(dataset) - # Validate using the TabularValidationSchema's validate_dataframe method - dataset_errors = schema.validate_dataframe(df) - # Add path information to errors - for error in dataset_errors: - error['path'] = path - errors.extend(dataset_errors) + resource = Resource(data=df, schema=schema._frictionless_schema) + report = resource.validate() + + for task in report.tasks: + for error in task.errors: + # Skip string type errors + if (hasattr(error, 'type') and error.type == 'type-error' and + hasattr(error, 'note') and 'type is "string' in error.note): + continue + + validation_error = ValidationError( + message=error.message, + row=error.rowNumber if hasattr(error, 'rowNumber') else None, + field=error.fieldName if hasattr(error, 'fieldName') else None, + type="ValidationError", + failed_keyword=error.type if hasattr(error, 'type') else "error", + path=path + ) + errors.append(validation_error) + except KeyError: - errors.append({ - "message": f"Dataset {path} not found", - "path": path, - "type": "ValidationError", - "failed_keyword": "required" - }) + errors.append(ValidationError( + message=f"Dataset {path} not found", + type="ValidationError", + failed_keyword="required", + path=path + )) except Exception as e: - errors.append({ - "message": f"Error validating dataset {path}: {str(e)}", - "path": path, - "type": "ValidationError", - "failed_keyword": "format" - }) + errors.append(ValidationError( + message=f"Error validating dataset {path}: {str(e)}", + type="ValidationError", + failed_keyword="format", + path=path + )) return errors + + def to_dict(self) -> dict: + """Convert the schema to a dictionary format including all fields""" + return self.model_dump(by_alias=True) + @classmethod + def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': + """Create a schema instance from a dictionary""" + properties = { + path: TabularValidationSchema.from_dict(schema_dict) + for path, schema_dict in data.get('properties', {}).items() + } + + return cls( + name=data['name'], + description=data['description'], + properties=properties, + required=data.get('required', []) + ) + +def write_schema(schema: TabularValidationSchema, output_file: str): + """Write a schema to a file""" + schema_dict = schema.to_dict() -def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: + with open(output_file, 'w') as f: + json.dump(schema_dict, f, indent=2) + +def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: # check that schemaFile exists schemaPath = pathlib.Path(schemaFilepath) - if not schemaPath.exists(): raise Exception with schemaPath.open("r+") as schemaFile: schemaFileContents = schemaFile.read() - schemaJson = json.loads(schemaFileContents) + schemaJson = json.loads(schemaFileContents) - # load the model into a tabular validation schema schemaModel = TabularValidationSchema.model_validate(schemaJson) - # TODO check for inconsitencies - - # does there exist a property with same name if propertyName in [key for key in schemaModel.properties.keys()]: raise PropertyNameException(propertyName) - # does there exist a property with same column number - schema_indicies = [ val.index for val in 
schemaModel.properties.values()] - - # check overlap of indicies - # CheckOverlap - - - # add new property to schema + schema_indicies = [val['index'] for val in schemaModel.properties.values()] + schemaModel.properties[propertyName] = propertyInstance - - # add new property as required schemaModel.required.append(propertyName) + schemaJson = json.dumps(schemaModel.model_dump(by_alias=True, exclude_none=True), indent=2) - # serialize model to json - schemaJson = json.dumps(schemaModel.model_dump(by_alias=True) , indent=2) - - # overwrite file contents + # overwrite file contents schemaFile.seek(0) schemaFile.write(schemaJson) @@ -525,64 +558,3 @@ def ReadSchemaLocal(schemaFile: str) -> TabularValidationSchema: tabularSchema = TabularValidationSchema.model_validate(schemaJson) return tabularSchema -def ReadSchema(schemaFile:str) -> TabularValidationSchema: - ''' Read a schema specified by the argument schemaFile - - The schemaFile parameter can be a url to a rawgithub link, or an ark identifier. - If the ark identifier is in the supplied, default schemas provided in the fairscape cli pacakges will be searched. - If there is no match then - ''' - - if 'raw.githubusercontent' in schemaFile: - schemaInstance = ReadSchemaGithub(schemaFile) - return schemaInstance - - - elif 'ark' in schemaFile: - defaultSchemas = ImportDefaultSchemas() - matchingSchemas = list(filter(lambda schema: schema.guid == str(schemaFile), defaultSchemas)) - - if len(matchingSchemas) == 0: - # request against fairscape - schemaInstance = ReadSchemaFairscape(schemaFile) - return schemaInstance - else: - defaultSchema = matchingSchemas[0] - return defaultSchema - - else: - # schema must be a path that exists - schemaInstance = ReadSchemaLocal(schemaFile) - return schemaInstance - -def WriteSchema(tabular_schema: TabularValidationSchema, schema_file): - """ Helper Function for writing files - """ - - schema_dictionary = tabular_schema.model_dump(by_alias=True) - schema_json = json.dumps(schema_dictionary, indent=2) - - # dump json to a file - with open(schema_file, "w") as output_file: - output_file.write(schema_json) - -@lru_cache -def ImportDefaultSchemas()-> List[TabularValidationSchema]: - defaultSchemaLocation = pathlib.Path(os.path.dirname(os.path.realpath(__file__))) / 'default_schemas' - schemaPaths = list(defaultSchemaLocation.rglob("*/*.json")) - - defaultSchemaList = [] - for schemaPathElem in schemaPaths: - - with schemaPathElem.open("r") as inputSchema: - inputSchemaData = inputSchema.read() - schemaJson = json.loads(inputSchemaData) - - try: - schemaElem = TabularValidationSchema.model_validate(schemaJson) - defaultSchemaList.append(schemaElem) - except: - # TODO handle validation failures from default schemas - pass - - return defaultSchemaList diff --git a/src/fairscape_cli/schema/frictionless_schema.py b/src/fairscape_cli/schema/frictionless_schema.py deleted file mode 100644 index a24a693..0000000 --- a/src/fairscape_cli/schema/frictionless_schema.py +++ /dev/null @@ -1,345 +0,0 @@ -import click -import json -from prettytable import PrettyTable -import pathlib -from pydantic import ( - ValidationError -) -from typing import ( - Union, - Type -) - -from fairscape_cli.models.schema.frictionless_tabular import ( - TabularValidationSchema, - HDF5ValidationSchema, - write_schema as WriteSchema, - read_schema as ReadSchema, - StringProperty, - NumberProperty, - IntegerProperty, - BooleanProperty, - ArrayProperty, - AppendProperty, - ClickAppendProperty, - DatatypeEnum, - Items, - PropertyNameException, - 
ColumnIndexException -) - -from fairscape_cli.config import ( - FAIRSCAPE_URI -) - -@click.group('frictionless') -def frictionless(): - """Invoke operations on dataset schema. - """ - pass - -@frictionless.command('create-tabular') -@click.option('--name', required=True, type=str) -@click.option('--description', required=True, type=str) -@click.option('--guid', required=False, type=str, default=None, show_default=False) -@click.option('--separator', type=str, required=True) -@click.option('--header', required=False, type=bool, default=False) -@click.argument('schema_file', type=str) -@click.pass_context -def create_tabular_schema( - ctx, - name, - description, - guid, - header, - separator, - schema_file -): - """Initialize a Tabular Schema. - """ - try: - schema_model = TabularValidationSchema.model_validate({ - "name": name, - "description":description, - "guid":guid, - "properties":{}, - "required": [], - "header":header, - "separator": separator - }) - - except ValidationError as metadataError: - click.echo("ERROR Validating TabularValidationSchema") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - WriteSchema(schema_model, schema_file) - click.echo(f"Wrote Schema: {str(schema_file)}") - -@frictionless.group('add-property') -def add_property(): - """Add a Property to an existing schema. - """ - pass - -@add_property.command('string') -@click.option('--name', type=str, required=True) -@click.option('--index', type=int, required=True) -@click.option('--description', type=str, required=True) -@click.option('--value-url', type=str, required=False) -@click.option('--pattern', type=str, required=False) -@click.argument('schema_file', type=click.Path(exists=True)) -@click.pass_context -def add_property_string(ctx, name, index, description, value_url, pattern, schema_file): - """Add a String Property to an existing Schema. - """ - try: - stringPropertyModel = StringProperty.model_validate({ - "name": name, - "index": index, - "type": "string", - "description": description, - "valueURL": value_url, - "pattern": pattern - }) - except ValidationError as metadataError: - click.echo("ERROR Validating StringProperty") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - ClickAppendProperty(ctx, schema_file, stringPropertyModel, name) - -@add_property.command('number') -@click.option('--name', type=str, required=True) -@click.option('--index', type=int, required=True) -@click.option('--description', type=str, required=True) -@click.option('--maximum', type=float, required=False) -@click.option('--minimum', type=float, required=False) -@click.option('--value-url', type=str, required=False) -@click.argument('schema_file', type=click.Path(exists=True)) -@click.pass_context -def add_property_number(ctx, name, index, description, maximum, minimum, value_url, schema_file): - """Add a Numeric property to an existing Schema. 
- """ - try: - numberPropertyModel = NumberProperty.model_validate({ - "name": name, - "index": index, - "type": "number", - 'maximum': maximum, - 'minimum': minimum, - "description": description, - "valueURL": value_url - }) - except ValidationError as metadataError: - click.echo("ERROR Validating NumberProperty") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - ClickAppendProperty(ctx, schema_file, numberPropertyModel, name) - -@add_property.command('boolean') -@click.option('--name', type=str, required=True) -@click.option('--index', type=int, required=True) -@click.option('--description', type=str, required=True) -@click.option('--value-url', type=str, required=False) -@click.argument('schema_file', type=click.Path(exists=True)) -@click.pass_context -def add_property_boolean(ctx, name, index, description, value_url, schema_file): - """Add a Boolean property to an existing Schema. - """ - try: - booleanPropertyModel = BooleanProperty.model_validate({ - "name": name, - "index": index, - "type": "boolean", - "description": description, - "valueURL": value_url - }) - except ValidationError as metadataError: - click.echo("ERROR Validating BooleanProperty") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - ClickAppendProperty(ctx, schema_file, booleanPropertyModel, name) - -@add_property.command('integer') -@click.option('--name', type=str, required=True) -@click.option('--index', type=int, required=True) -@click.option('--description', type=str, required=True) -@click.option('--maximum', type=int, required=False) -@click.option('--minimum', type=int, required=False) -@click.option('--value-url', type=str, required=False) -@click.argument('schema_file', type=click.Path(exists=True)) -@click.pass_context -def add_property_integer(ctx, name, index, description, maximum, minimum, value_url, schema_file): - """Add an Integer property to an existing Schema. - """ - try: - integerPropertyModel = IntegerProperty.model_validate({ - "name": name, - "index": index, - "type": "integer", - "description": description, - "maximum": maximum, - "minimum": minimum, - "valueURL": value_url - }) - except ValidationError as metadataError: - click.echo("ERROR Validating IntegerProperty") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - ClickAppendProperty(ctx, schema_file, integerPropertyModel, name) - -@add_property.command('array') -@click.option('--name', type=str, required=True) -@click.option('--index', type=str, required=True) -@click.option('--description', type=str, required=True) -@click.option('--value-url', type=str, required=False) -@click.option('--items-datatype', type=str, required=True) -@click.option('--min-items', type=int, required=False) -@click.option('--max-items', type=int, required=False) -@click.option('--unique-items', type=bool, required=False) -@click.argument('schema_file', type=click.Path(exists=True)) -@click.pass_context -def add_property_array(ctx, name, index, description, value_url, items_datatype, min_items, max_items, unique_items, schema_file): - """Add an Array property to an existing Schema. 
- """ - try: - datatype_enum = DatatypeEnum(items_datatype) - except Exception: - print(f"ITEMS Datatype {items_datatype} invalid\n" + - "ITEMS must be oneOf 'boolean'|'object'|'string'|'number'|'integer'" - ) - ctx.exit(code=1) - - try: - arrayPropertyModel = ArrayProperty( - datatype='array', - index=index, - description=description, - valueURL=value_url, - maxItems=max_items, - minItems=min_items, - uniqueItems=unique_items, - items=Items(datatype=datatype_enum) - ) - except ValidationError as metadataError: - print("ERROR: MetadataValidationError") - for validationFailure in metadataError.errors(): - click.echo(f"property: {validationFailure.get('loc')} \tmsg: {validationFailure.get('msg')}") - ctx.exit(code=1) - - ClickAppendProperty(ctx, schema_file, arrayPropertyModel, name) - -def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5ValidationSchema]]: - """Determine which schema type to use based on file extension""" - ext = pathlib.Path(filepath).suffix.lower()[1:] - if ext in ('h5', 'hdf5'): - return HDF5ValidationSchema - elif ext in ('csv', 'tsv', 'parquet'): - return TabularValidationSchema - else: - raise ValueError(f"Unsupported file extension: {ext}") - -@frictionless.command('validate') -@click.option('--schema', type=str, required=True) -@click.option('--data', type=str, required=True) -@click.pass_context -def validate(ctx, schema, data): - """Execute validation of a Schema against the provided data.""" - if 'ark' not in schema: - schema_path = pathlib.Path(schema) - if not schema_path.exists(): - click.echo(f"ERROR: Schema file at path {schema} does not exist") - ctx.exit(1) - - data_path = pathlib.Path(data) - if not data_path.exists(): - click.echo(f"ERROR: Data file at path {data} does not exist") - ctx.exit(1) - - try: - with open(schema) as f: - schema_json = json.load(f) - - schema_class = determine_schema_type(data) - validation_schema = schema_class.from_dict(schema_json) - - validation_errors = validation_schema.validate_file(data) - - if len(validation_errors) != 0: - error_table = PrettyTable() - if isinstance(validation_schema, HDF5ValidationSchema): - error_table.field_names = ['path', 'error_type', 'failed_keyword', 'message'] - else: - error_table.field_names = ['row', 'error_type', 'failed_keyword', 'message'] - - for err in validation_errors: - if isinstance(validation_schema, HDF5ValidationSchema): - error_table.add_row([ - err.path, - err.type, - err.failed_keyword, - str(err.message) - ]) - else: - error_table.add_row([ - err.row, - err.type, - err.failed_keyword, - str(err.message) - ]) - - print(error_table) - ctx.exit(1) - else: - print('Validation Success') - ctx.exit(0) - - except ValidationError as metadata_error: - click.echo("Error with schema definition") - for validation_failure in metadata_error.errors(): - click.echo(f"property: {validation_failure.get('loc')} \tmsg: {validation_failure.get('msg')}") - ctx.exit(1) - except Exception as e: - click.echo(f"Error during validation: {str(e)}") - ctx.exit(1) - -@frictionless.command('infer') -@click.option('--name', required=True, type=str) -@click.option('--description', required=True, type=str) -@click.option('--guid', required=False, type=str, default="", show_default=False) -@click.argument('input_file', type=click.Path(exists=True)) -@click.argument('schema_file', type=str) -@click.pass_context -def infer_schema(ctx, name, description, guid, input_file, schema_file): - """Infer a schema from a file (CSV, TSV, Parquet, or HDF5).""" - try: - schema_class = 
determine_schema_type(input_file) - - schema_model = schema_class.infer_from_file( - input_file, - name, - description - ) - if guid: - schema_model.guid = guid - - WriteSchema(schema_model, schema_file) - - ext = pathlib.Path(input_file).suffix.lower()[1:] - click.echo(f"Inferred Schema from {ext} file: {str(schema_file)}") - - except ValueError as e: - click.echo(f"Error with file type: {str(e)}") - ctx.exit(code=1) - except Exception as e: - click.echo(f"Error inferring schema: {str(e)}") - ctx.exit(code=1) \ No newline at end of file diff --git a/src/fairscape_cli/schema/schema.py b/src/fairscape_cli/schema/schema.py index e17b9d3..4ccf428 100644 --- a/src/fairscape_cli/schema/schema.py +++ b/src/fairscape_cli/schema/schema.py @@ -3,49 +3,45 @@ from prettytable import PrettyTable import pathlib from pydantic import ( - ValidationError + ValidationError ) from typing import ( Union, Type ) - from fairscape_cli.models.schema.tabular import ( TabularValidationSchema, - ReadSchema, - ImportDefaultSchemas, - WriteSchema, + HDF5ValidationSchema, + write_schema as WriteSchema, + read_schema as ReadSchema, StringProperty, NumberProperty, IntegerProperty, BooleanProperty, ArrayProperty, + AppendProperty, ClickAppendProperty, - PropertyNameException, - ColumnIndexException, DatatypeEnum, Items, - FileType, - HDF5Schema + PropertyNameException, + ColumnIndexException ) from fairscape_cli.config import ( FAIRSCAPE_URI ) - @click.group('schema') def schema(): """Invoke operations on dataset schema. """ pass - @schema.command('create-tabular') @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) -@click.option('--guid', required=False, type=str, default="", show_default=False) +@click.option('--guid', required=False, type=str, default=None, show_default=False) @click.option('--separator', type=str, required=True) @click.option('--header', required=False, type=bool, default=False) @click.argument('schema_file', type=str) @@ -61,7 +57,6 @@ def create_tabular_schema( ): """Initialize a Tabular Schema. 
""" - # create the model try: schema_model = TabularValidationSchema.model_validate({ "name": name, @@ -80,8 +75,7 @@ def create_tabular_schema( ctx.exit(code=1) WriteSchema(schema_model, schema_file) - click.echo(f"Wrote Schema: {str(schema_file)}") - + click.echo(f"Wrote Schema: {str(schema_file)}") @schema.group('add-property') def add_property(): @@ -89,7 +83,6 @@ def add_property(): """ pass - @add_property.command('string') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -118,7 +111,6 @@ def add_property_string(ctx, name, index, description, value_url, pattern, schem ClickAppendProperty(ctx, schema_file, stringPropertyModel, name) - @add_property.command('number') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -141,7 +133,6 @@ def add_property_number(ctx, name, index, description, maximum, minimum, value_u "description": description, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating NumberProperty") for validationFailure in metadataError.errors(): @@ -150,7 +141,6 @@ def add_property_number(ctx, name, index, description, maximum, minimum, value_u ClickAppendProperty(ctx, schema_file, numberPropertyModel, name) - @add_property.command('boolean') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -169,7 +159,6 @@ def add_property_boolean(ctx, name, index, description, value_url, schema_file): "description": description, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating BooleanProperty") for validationFailure in metadataError.errors(): @@ -178,7 +167,6 @@ def add_property_boolean(ctx, name, index, description, value_url, schema_file): ClickAppendProperty(ctx, schema_file, booleanPropertyModel, name) - @add_property.command('integer') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -201,7 +189,6 @@ def add_property_integer(ctx, name, index, description, maximum, minimum, value_ "minimum": minimum, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating IntegerProperty") for validationFailure in metadataError.errors(): @@ -210,7 +197,6 @@ def add_property_integer(ctx, name, index, description, maximum, minimum, value_ ClickAppendProperty(ctx, schema_file, integerPropertyModel, name) - @add_property.command('array') @click.option('--name', type=str, required=True) @click.option('--index', type=str, required=True) @@ -244,7 +230,6 @@ def add_property_array(ctx, name, index, description, value_url, items_datatype, uniqueItems=unique_items, items=Items(datatype=datatype_enum) ) - except ValidationError as metadataError: print("ERROR: MetadataValidationError") for validationFailure in metadataError.errors(): @@ -253,12 +238,11 @@ def add_property_array(ctx, name, index, description, value_url, items_datatype, ClickAppendProperty(ctx, schema_file, arrayPropertyModel, name) - -def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5Schema]]: +def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5ValidationSchema]]: """Determine which schema type to use based on file extension""" ext = pathlib.Path(filepath).suffix.lower()[1:] if ext in ('h5', 'hdf5'): - return HDF5Schema + return HDF5ValidationSchema elif ext in ('csv', 'tsv', 'parquet'): return TabularValidationSchema else: @@ -270,7 +254,6 @@ 
def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, @click.pass_context def validate(ctx, schema, data): """Execute validation of a Schema against the provided data.""" - # Check if schema file exists (if not a default schema) if 'ark' not in schema: schema_path = pathlib.Path(schema) if not schema_path.exists(): @@ -283,39 +266,35 @@ def validate(ctx, schema, data): ctx.exit(1) try: - # Load the schema file with open(schema) as f: schema_json = json.load(f) - # Determine schema type based on the data file schema_class = determine_schema_type(data) - validation_schema = schema_class.model_validate(schema_json) + validation_schema = schema_class.from_dict(schema_json) - # Validate the file validation_errors = validation_schema.validate_file(data) if len(validation_errors) != 0: - # Create a pretty table of validation errors error_table = PrettyTable() - if isinstance(validation_schema, HDF5Schema): + if isinstance(validation_schema, HDF5ValidationSchema): error_table.field_names = ['path', 'error_type', 'failed_keyword', 'message'] else: error_table.field_names = ['row', 'error_type', 'failed_keyword', 'message'] for err in validation_errors: - if isinstance(validation_schema, HDF5Schema): + if isinstance(validation_schema, HDF5ValidationSchema): error_table.add_row([ - err.get("path"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.path, + err.type, + err.failed_keyword, + str(err.message) ]) else: error_table.add_row([ - err.get("row"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.row, + err.type, + err.failed_keyword, + str(err.message) ]) print(error_table) @@ -337,29 +316,24 @@ def validate(ctx, schema, data): @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) @click.option('--guid', required=False, type=str, default="", show_default=False) -@click.option('--include-min-max', is_flag=True, help="Include min and max values for numeric and integer fields") @click.argument('input_file', type=click.Path(exists=True)) @click.argument('schema_file', type=str) @click.pass_context -def infer_schema(ctx, name, description, guid, include_min_max, input_file, schema_file): +def infer_schema(ctx, name, description, guid, input_file, schema_file): """Infer a schema from a file (CSV, TSV, Parquet, or HDF5).""" try: - # Determine which schema type to use based on input file schema_class = determine_schema_type(input_file) - # Infer the schema schema_model = schema_class.infer_from_file( input_file, name, - description, - include_min_max + description ) if guid: schema_model.guid = guid WriteSchema(schema_model, schema_file) - # Get file type for display ext = pathlib.Path(input_file).suffix.lower()[1:] click.echo(f"Inferred Schema from {ext} file: {str(schema_file)}") From ed3aab0353d845652c90eb1e66e0cfcf979b3ad2 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 16:05:19 -0500 Subject: [PATCH 13/14] extra code --- src/fairscape_cli/models/schema/tabular.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index 1b6df4a..09e8600 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -158,20 +158,6 @@ def frictionless_type_to_json_schema(field_type: str) -> str: } return type_mapping.get(field_type, 'string') - -PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, 
NumberProperty, IntegerProperty, NullProperty] -def frictionless_type_from_property(prop: PropertyUnion) -> str: - """Convert PropertyUnion type to Frictionless field type""" - type_mapping = { - 'string': 'string', - 'integer': 'integer', - 'number': 'number', - 'boolean': 'boolean', - 'array': 'array', - 'null': 'string' # Default to string for null type - } - return type_mapping.get(prop.datatype, 'string') - class TabularValidationSchema(BaseModel): model_config = ConfigDict(populate_by_name=True) From 480544e10157fd12d5be7a6d91242121f18dd6e5 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Dec 2024 16:10:25 -0500 Subject: [PATCH 14/14] drop extra code --- src/fairscape_cli/models/schema/tabular.py | 12 ------------ src/fairscape_cli/schema/schema.py | 8 -------- 2 files changed, 20 deletions(-) diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index 09e8600..fda4551 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -306,18 +306,6 @@ def from_dict(cls, data: dict) -> 'TabularValidationSchema': schema = cls(**data, properties=properties, required=required_fields) schema._frictionless_schema = frictionless_schema return schema - -def read_schema(schema_file: str) -> TabularValidationSchema: - """Read a schema from a file""" - schema_path = pathlib.Path(schema_file) - - if not schema_path.exists(): - raise FileNotFoundError(f"Schema file not found: {schema_file}") - - with schema_path.open('r') as f: - schema_dict = json.load(f) - - return TabularValidationSchema.from_dict(schema_dict) class HDF5ValidationSchema(BaseModel): guid: Optional[str] = Field(alias="@id", default=None) diff --git a/src/fairscape_cli/schema/schema.py b/src/fairscape_cli/schema/schema.py index 4ccf428..a09cf2b 100644 --- a/src/fairscape_cli/schema/schema.py +++ b/src/fairscape_cli/schema/schema.py @@ -14,22 +14,14 @@ TabularValidationSchema, HDF5ValidationSchema, write_schema as WriteSchema, - read_schema as ReadSchema, StringProperty, NumberProperty, IntegerProperty, BooleanProperty, ArrayProperty, - AppendProperty, ClickAppendProperty, DatatypeEnum, Items, - PropertyNameException, - ColumnIndexException -) - -from fairscape_cli.config import ( - FAIRSCAPE_URI ) @click.group('schema')
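
For illustration only, below is a minimal sketch of how the frictionless-backed schema classes introduced in this series might be exercised directly from Python, outside the click commands. The import path mirrors the module patched above (fairscape_cli.models.schema.tabular); the file names "data.csv" and "example_schema.json" are placeholders and are not referenced anywhere in the patch itself.

    # Sketch: infer, persist, and validate a tabular schema.
    # Assumes a local delimited file "data.csv"; output path is arbitrary.
    from fairscape_cli.models.schema.tabular import (
        TabularValidationSchema,
        write_schema,
    )

    # Infer column properties from the file via frictionless describe()
    schema = TabularValidationSchema.infer_from_file(
        "data.csv",
        name="example dataset",
        description="Schema inferred for illustration",
    )

    # Serialize the schema to JSON (the private frictionless schema object is excluded)
    write_schema(schema, "example_schema.json")

    # Validate the same file; errors are surfaced as ValidationError models
    for err in schema.validate_file("data.csv"):
        print(err.row, err.field, err.failed_keyword, err.message)

This is the same flow that the `schema infer` and `schema validate` commands wrap at the CLI level.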