diff --git a/src/fairscape_cli/models/__init__.py b/src/fairscape_cli/models/__init__.py index bdc45ab..41a2ef8 100644 --- a/src/fairscape_cli/models/__init__.py +++ b/src/fairscape_cli/models/__init__.py @@ -1,6 +1,8 @@ from fairscape_cli.models.dataset import ( Dataset, - GenerateDataset + GenerateDataset, + generateSummaryStatsElements, + registerOutputs ) from fairscape_cli.models.software import Software, GenerateSoftware from fairscape_cli.models.computation import Computation, GenerateComputation @@ -9,13 +11,16 @@ GenerateROCrate, ReadROCrateMetadata, AppendCrate, - CopyToROCrate + CopyToROCrate, + UpdateCrate ) from fairscape_cli.models.bagit import BagIt __all__ = [ 'Dataset', 'GenerateDataset', + 'generateSummaryStatsElements', + 'registerOutputs', 'Software', 'GenerateSoftware', 'Computation', @@ -25,5 +30,6 @@ 'ReadROCrateMetadata', 'AppendCrate', 'CopyToROCrate', + 'UpdateCrate', 'BagIt' ] diff --git a/src/fairscape_cli/models/computation.py b/src/fairscape_cli/models/computation.py index fa3f7f3..24e67ac 100644 --- a/src/fairscape_cli/models/computation.py +++ b/src/fairscape_cli/models/computation.py @@ -1,19 +1,12 @@ -from fairscape_cli.models.base import FairscapeBaseModel -from fairscape_cli.models.utils import GenerateDatetimeSquid -from fairscape_cli.config import NAAN - -from typing import ( - Optional, - List, - Union, - Dict, -) -from pydantic import ( - Field, - AnyUrl -) import re from datetime import datetime +from typing import Optional, List, Union, Dict + +from pydantic import Field, AnyUrl + +from fairscape_cli.config import NAAN +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class Computation(FairscapeBaseModel): @@ -67,7 +60,7 @@ def GenerateComputation( computation_model = Computation.model_validate( { "@id": guid, - "@type": "https://w2id.org/EVI#Computation", + "@type": "https://w3id.org/EVI#Computation", "name": name, "description": description, "keywords": keywords, diff --git a/src/fairscape_cli/models/dataset.py b/src/fairscape_cli/models/dataset.py index 1ca3a8f..9f662ff 100644 --- a/src/fairscape_cli/models/dataset.py +++ b/src/fairscape_cli/models/dataset.py @@ -1,22 +1,7 @@ -from fairscape_cli.models.base import ( - FairscapeBaseModel, - Identifier -) -from fairscape_cli.config import ( - NAAN -) -from fairscape_cli.models.utils import GenerateDatetimeSquid, FileNotInCrateException -from fairscape_cli.models.schema.tabular import ( - TabularValidationSchema -) - +# Standard library imports import pathlib -from typing import ( - Optional, - List, - Union, - Dict -) +from datetime import datetime +from typing import Optional, List, Union, Dict, Tuple, Set from pydantic import ( BaseModel, @@ -25,7 +10,10 @@ AnyUrl, field_serializer ) -from datetime import datetime + +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid +from fairscape_cli.config import NAAN class Dataset(FairscapeBaseModel): @@ -44,6 +32,7 @@ class Dataset(FairscapeBaseModel): derivedFrom: Optional[List[str]] = Field(default=[]) usedBy: Optional[List[str]] = Field(default=[]) contentUrl: Optional[str] = Field(default=None) + hasSummaryStatistics: Optional[Union[str, List[str]]] = Field(default=None) #@field_serializer('datePublished') #def serialize_date_published(self, datePublished: datetime): @@ -68,11 +57,13 @@ def GenerateDataset( usedBy: Optional[List[str]], generatedBy: Optional[List[str]], filepath: Optional[str], - cratePath + 
cratePath, + summary_stats_guid: Optional[str] = None ): - sq = GenerateDatetimeSquid() - guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}" + if not guid: + sq = GenerateDatetimeSquid() + guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}" datasetMetadata = { "@id": guid, @@ -88,22 +79,14 @@ def GenerateDataset( "additionalDocumentation": additionalDocumentation, "format": dataFormat, "schema": schema, - # sanitize input lists of newline breaks - "derivedFrom": [ - derived.strip("\n") for derived in derivedFrom - ], - "usedBy": [ - used.strip("\n") for used in usedBy - ], - "generatedBy": [ - gen.strip("\n") for gen in generatedBy - ] + "derivedFrom": [derived.strip("\n") for derived in derivedFrom], + "usedBy": [used.strip("\n") for used in usedBy], + "generatedBy": [gen.strip("\n") for gen in generatedBy], + "hasSummaryStatistics": summary_stats_guid } - datasetMetadata['contentURL'] = setRelativeFilepath(cratePath, filepath) - + datasetMetadata['contentUrl'] = setRelativeFilepath(cratePath, filepath) datasetInstance = Dataset.model_validate(datasetMetadata) - return datasetInstance @@ -136,4 +119,117 @@ def setRelativeFilepath(cratePath, filePath): # if relative filepath datasetPath = pathlib.Path(filePath).absolute() relativePath = datasetPath.relative_to(rocratePath) - return f"file:///{str(relativePath)}" \ No newline at end of file + return f"file:///{str(relativePath)}" + + +from fairscape_cli.models.computation import GenerateComputation, Computation +def generateSummaryStatsElements( + name: str, + author: str, + keywords: List[str], + date_published: str, + version: str, + associated_publication: Optional[str], + additional_documentation: Optional[str], + schema: Optional[str], + dataset_guid: str, + summary_statistics_filepath: str, + crate_path: pathlib.Path +) -> Tuple[str, Dataset, Computation]: + """Generate summary statistics dataset and computation elements + + Args: + name: Name of the main dataset + author: Author of the dataset + keywords: Dataset keywords + date_published: Publication date + version: Dataset version + associated_publication: Optional associated publication + additional_documentation: Optional additional documentation + schema: Optional schema + dataset_guid: GUID of the main dataset + summary_statistics_filepath: Path to summary statistics file + crate_path: Path to RO-Crate + + Returns: + Tuple containing: + - Summary statistics GUID + - Summary statistics Dataset instance + - Computation instance that generated the summary statistics + """ + # Generate GUIDs + sq_stats = GenerateDatetimeSquid() + summary_stats_guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-stats-{sq_stats}" + + sq_comp = GenerateDatetimeSquid() + computation_guid = f"ark:{NAAN}/computation-{name.lower().replace(' ', '-')}-stats-{sq_comp}" + + # Create computation instance + computation_instance = GenerateComputation( + guid=computation_guid, + name=f"Summary Statistics Computation for {name}", + runBy=author, + command="", + dateCreated=date_published, + description=f"Computation that generated summary statistics for dataset: {name}", + keywords=keywords, + usedSoftware=[], + usedDataset=[dataset_guid], + generated=[summary_stats_guid] + ) + + # Create summary statistics dataset + summary_stats_instance = GenerateDataset( + guid=summary_stats_guid, + url=None, + author=author, + name=f"{name} - Summary Statistics", + description=f"Summary statistics for dataset: {name}", + keywords=keywords, + datePublished=date_published, + 
version=version, + associatedPublication=associated_publication, + additionalDocumentation=additional_documentation, + dataFormat='pdf', + schema=schema, + derivedFrom=[], + generatedBy=[computation_guid], + usedBy=[], + filepath=summary_statistics_filepath, + cratePath=crate_path, + summary_stats_guid=None + ) + + return summary_stats_guid, summary_stats_instance, computation_instance + +def registerOutputs( + new_files: Set[pathlib.Path], + computation_id: str, + dataset_id: str, + author: str +) -> List[Dict]: + """Register all outputs as datasets""" + output_instances = [] + for file_path in new_files: + file_path_str = str(file_path) + output_instance = GenerateDataset( + guid=None, + name=f"Statistics Output - {file_path.name}", + author=author, # Use the original author + description=f"Statistical analysis output for {dataset_id}", + keywords=["statistics"], + datePublished=datetime.now().isoformat(), + version="1.0", + dataFormat=file_path.suffix[1:], + filepath=file_path_str, + cratePath=str(file_path.parent), + url=None, + associatedPublication=None, + additionalDocumentation=None, + schema=None, + derivedFrom=[], + usedBy=[], + generatedBy=[computation_id] + ) + output_instances.append(output_instance) + return output_instances \ No newline at end of file diff --git a/src/fairscape_cli/models/guid_utils.py b/src/fairscape_cli/models/guid_utils.py new file mode 100644 index 0000000..a85988f --- /dev/null +++ b/src/fairscape_cli/models/guid_utils.py @@ -0,0 +1,31 @@ +from sqids import Sqids +import random +import datetime + +from typing import Set, Dict, List, Optional, Tuple + +from fairscape_cli.config import NAAN + +squids = Sqids(min_length=6) + +def GenerateDatetimeSquid(): + try: + timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) + sq = squids.encode([timestamp_int, random.randint(0, 10000)]) + except: + timestamp_int = int(datetime.datetime.utcnow().timestamp()) + sq = squids.encode([timestamp_int]) + return sq + +def GenerateDatetimeGUID(prefix: str)->str: + try: + timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) + sq = squids.encode([timestamp_int]) + except: + timestamp_int = int(datetime.datetime.utcnow().timestamp()) + sq = squids.encode([timestamp_int]) + return f"ark:{NAAN}/{prefix}-{sq}" + +def GenerateGUID(data: List[int], prefix: str)-> str: + squid_encoded = squids.encode(data) + return f"ark:{NAAN}/{prefix}-{squid_encoded}" \ No newline at end of file diff --git a/src/fairscape_cli/models/rocrate.py b/src/fairscape_cli/models/rocrate.py index 91c8ade..275c8b1 100644 --- a/src/fairscape_cli/models/rocrate.py +++ b/src/fairscape_cli/models/rocrate.py @@ -1,30 +1,16 @@ -from fairscape_cli.models import ( - Software, - Dataset, - Computation -) -from fairscape_cli.models.utils import GenerateDatetimeSquid -from fairscape_cli.config import ( - DEFAULT_CONTEXT, - NAAN -) - import pathlib import shutil import json +from typing import Optional, Union, List, Literal, Dict + from prettytable import PrettyTable -from pydantic import ( - BaseModel, - computed_field, - Field, -) -from typing import ( - Optional, - Union, - List, - Literal, - Dict -) +from pydantic import BaseModel, computed_field, Field + +from fairscape_cli.config import NAAN, DEFAULT_CONTEXT +from fairscape_cli.models.software import Software +from fairscape_cli.models.dataset import Dataset +from fairscape_cli.models.computation import Computation +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class ROCrateMetadata(BaseModel): guid: Optional[str] 
= Field(alias="@id", default=None) @@ -321,3 +307,32 @@ def CopyToROCrate(source_filepath: str, destination_filepath: str): # copy the file into the destinationPath shutil.copy(source_path, destination_path) +def UpdateCrate( + cratePath: pathlib.Path, + element: Union[Dataset, Software, Computation] +): + """Update an existing element in the RO-Crate metadata by matching @id + + Args: + cratePath: Path to the RO-Crate directory or metadata file + element: Updated element to replace existing one with matching @id + """ + if cratePath.is_dir(): + cratePath = cratePath / 'ro-crate-metadata.json' + + with cratePath.open("r+") as rocrate_metadata_file: + rocrate_metadata = json.load(rocrate_metadata_file) + + # Find and replace the element with matching @id + for i, existing in enumerate(rocrate_metadata['@graph']): + if existing.get('@id') == element.guid: + rocrate_metadata['@graph'][i] = element.model_dump( + by_alias=True, + exclude_none=True + ) + break + + # Write back the updated metadata + rocrate_metadata_file.seek(0) + rocrate_metadata_file.truncate() + json.dump(rocrate_metadata, rocrate_metadata_file, indent=2) \ No newline at end of file diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index 387444a..fda4551 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -1,17 +1,13 @@ -import jsonschema import pathlib -from functools import lru_cache import os import json import pandas as pd -import pyarrow.parquet as pq -import pyarrow.compute as pc import h5py +from datetime import datetime from enum import Enum from pydantic import ( BaseModel, ConfigDict, - computed_field, Field, ValidationError, model_validator @@ -19,21 +15,16 @@ from typing import ( Dict, List, - Optional, - Union, + Optional, Literal, - Type + Union ) +from frictionless import Schema, Resource, describe, fields + from fairscape_cli.models.schema.utils import ( - GenerateSlice, PropertyNameException, ColumnIndexException, - map_arrow_type_to_json_schema -) - -from fairscape_cli.models.utils import ( - GenerateDatetimeSquid ) from fairscape_cli.config import ( @@ -46,14 +37,11 @@ class FileType(str, Enum): CSV = "csv" TSV = "tsv" PARQUET = "parquet" - HDF5 = "h5" @classmethod def from_extension(cls, filepath: str) -> 'FileType': - ext = pathlib.Path(filepath).suffix.lower()[1:] # Remove the dot - if ext == 'h5' or ext == 'hdf5': - return cls.HDF5 - elif ext == 'parquet': + ext = pathlib.Path(filepath).suffix.lower()[1:] + if ext == 'parquet': return cls.PARQUET elif ext == 'tsv': return cls.TSV @@ -62,6 +50,14 @@ def from_extension(cls, filepath: str) -> 'FileType': else: raise ValueError(f"Unsupported file extension: {ext}") +class ValidationError(BaseModel): + message: str + row: Optional[int] = None + field: Optional[str] = None + type: str = "ValidationError" + failed_keyword: str + path: Optional[str] = None + class DatatypeEnum(str, Enum): NULL = "null" BOOLEAN = "boolean" @@ -142,250 +138,220 @@ def check_max_min(self) -> 'IntegerProperty': raise ValueError('IntegerProperty attribute maximum !< minimum') return self -class BaseSchema(BaseModel): +def frictionless_type_to_json_schema(field_type: str) -> str: + """Convert Frictionless types to JSON Schema types""" + type_mapping = { + 'string': 'string', + 'integer': 'integer', + 'number': 'number', + 'boolean': 'boolean', + 'date': 'string', + 'datetime': 'string', + 'year': 'integer', + 'yearmonth': 'string', + 'duration': 'string', + 'geopoint': 'array', 
+ 'geojson': 'object', + 'array': 'array', + 'object': 'object', + 'time': 'string' + } + return type_mapping.get(field_type, 'string') + +class TabularValidationSchema(BaseModel): + model_config = ConfigDict(populate_by_name=True) + guid: Optional[str] = Field(alias="@id", default=None) context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") metadataType: Optional[str] = Field(default=DEFAULT_SCHEMA_TYPE, alias="@type") - schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="schema") + schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="$schema") name: str description: str datatype: str = Field(default="object", alias="type") + separator: str = Field(description="Field separator for the file") + header: bool = Field(description="Do files of this schema have a header row", default=True) + required: List[str] = Field(default=[]) + properties: Dict[str, Dict] = Field(default={}) additionalProperties: bool = Field(default=True) - required: List[str] = Field(description="list of required properties by name", default=[]) - examples: Optional[List[Dict[str, str]]] = Field(default=[]) + + # Store the frictionless schema + _frictionless_schema: Optional[Schema] = None def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" if self.guid is None: prefix = f"schema-{self.name.lower().replace(' ', '-')}" - sq = GenerateDatetimeSquid() - self.guid = f"ark:{NAAN}/{prefix}-{sq}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" return self.guid - + @model_validator(mode='after') - def generate_all_guids(self) -> 'BaseSchema': + def generate_all_guids(self) -> 'TabularValidationSchema': """Generate GUIDs for this schema and any nested schemas""" self.generate_guid() - - # Generate GUIDs for any nested schemas in properties - if hasattr(self, 'properties'): - for prop in self.properties.values(): - if isinstance(prop, BaseSchema): - prop.generate_guid() - return self - - def to_json_schema(self) -> dict: - """Convert the HDF5Schema to JSON Schema format""" - schema = self.model_dump( - by_alias=True, - exclude_unset=True, - exclude_none=True - ) - return schema - -PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty] -class TabularValidationSchema(BaseSchema): - properties: Dict[str, PropertyUnion] = Field(default={}) - separator: str = Field(description="Field separator for the file") - header: bool = Field(description="Do files of this schema have a header row", default=False) @classmethod def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'TabularValidationSchema': - """Infer schema from a file""" + """Infer schema from a file using Frictionless""" file_type = FileType.from_extension(filepath) + separator = '\t' if file_type == FileType.TSV else ',' - if file_type == FileType.PARQUET: - return cls.infer_from_parquet(name, description, None, filepath, include_min_max) - else: # csv or tsv - separator = '\t' if file_type == FileType.TSV else ',' - df = pd.read_csv(filepath, sep=separator) - return cls.infer_from_dataframe(df, name, description, include_min_max, separator) - - @classmethod - def infer_from_dataframe(cls, df: pd.DataFrame, name: str, description: str, include_min_max: bool = False, separator: str = ',') -> 'TabularValidationSchema': - """Infer schema from a pandas DataFrame""" - type_map = { - 'int16': 
('integer', IntegerProperty, int), - 'int32': ('integer', IntegerProperty, int), - 'int64': ('integer', IntegerProperty, int), - 'uint8': ('integer', IntegerProperty, int), - 'uint16': ('integer', IntegerProperty, int), - 'uint32': ('integer', IntegerProperty, int), - 'uint64': ('integer', IntegerProperty, int), - 'float16': ('number', NumberProperty, float), - 'float32': ('number', NumberProperty, float), - 'float64': ('number', NumberProperty, float), - 'bool': ('boolean', BooleanProperty, None), - } + resource = describe(filepath) properties = {} - for i, (column_name, dtype) in enumerate(df.dtypes.items()): - dtype_str = str(dtype) - datatype, property_class, converter = type_map.get(dtype_str, ('string', StringProperty, None)) + required_fields = [] + + for i, field in enumerate(resource.schema.fields): + json_schema_type = frictionless_type_to_json_schema(field.type) - kwargs = { - "datatype": datatype, - "description": f"Column {column_name}", + property_def = { + "type": json_schema_type, + "description": field.description or f"Column {field.name}", "index": i } - - if include_min_max and converter: - kwargs.update({ - "minimum": converter(df[column_name].min()), - "maximum": converter(df[column_name].max()) - }) - - properties[column_name] = property_class(**kwargs) + + properties[field.name] = property_def + required_fields.append(field.name) - return cls( + # Create our schema instance + schema = cls( name=name, description=description, - properties=properties, - required=list(properties.keys()), separator=separator, - header=True + header=True, + properties=properties, + required=required_fields ) + + # Store the frictionless schema for validation + schema._frictionless_schema = resource.schema + return schema - @classmethod - def infer_from_parquet(cls, name: str, description: str, guid: Optional[str], filepath: str, include_min_max: bool = False) -> 'TabularValidationSchema': - """Infer schema from a Parquet file""" - table = pq.read_table(filepath) - schema = table.schema - properties = {} - - for i, field in enumerate(schema): - field_name = field.name - field_type = map_arrow_type_to_json_schema(field.type) - - if field_type == 'string': - properties[field_name] = StringProperty( - datatype='string', - description=f"Column {field_name}", - index=i - ) - elif field_type == 'integer': - if include_min_max: - column = table.column(field_name) - min_max = pc.min_max(column) - properties[field_name] = IntegerProperty( - datatype='integer', - description=f"Column {field_name}", - index=i, - minimum=min_max['min'].as_py(), - maximum=min_max['max'].as_py() - ) - else: - properties[field_name] = IntegerProperty( - datatype='integer', - description=f"Column {field_name}", - index=i - ) - elif field_type == 'number': - if include_min_max: - column = table.column(field_name) - min_max = pc.min_max(column) - properties[field_name] = NumberProperty( - datatype='number', - description=f"Column {field_name}", - index=i, - minimum=min_max['min'].as_py(), - maximum=min_max['max'].as_py() + def validate_file(self, filepath: str) -> List[ValidationError]: + """Validate a file against the schema using Frictionless""" + if not self._frictionless_schema: + raise ValueError("Schema not properly initialized") + + resource = Resource( + path=os.path.basename(filepath), + basepath=os.path.dirname(filepath), + schema=self._frictionless_schema + ) + report = resource.validate() + + errors = [] + for task in report.tasks: + for error in task.errors: + if isinstance(error, TypeError): + validation_error 
= ValidationError( + message=str(error), + type="ValidationError", + failed_keyword="type" ) else: - properties[field_name] = NumberProperty( - datatype='number', - description=f"Column {field_name}", - index=i + validation_error = ValidationError( + message=error.message, + row=error.row_number if hasattr(error, 'row_number') else None, + field=error.field_name if hasattr(error, 'field_name') else None, + failed_keyword=error.code if hasattr(error, 'code') else "error" ) - elif field_type == 'boolean': - properties[field_name] = BooleanProperty( - datatype='boolean', - description=f"Column {field_name}", - index=i - ) + errors.append(validation_error) + + return errors - return cls( - name=name, - description=description, - guid=guid, - properties=properties, - required=list(properties.keys()), - separator=",", # Not used for parquet but required - header=True # Not used for parquet but required - ) + def to_dict(self) -> dict: + """Convert the schema to a dictionary format""" + return self.model_dump(by_alias=True, exclude={'_frictionless_schema'}) - def validate_file(self, filepath: str) -> List[Dict]: - """Validate a file against the schema""" - file_type = FileType.from_extension(filepath) + @classmethod + def from_dict(cls, data: dict) -> 'TabularValidationSchema': + """Create a schema instance from a dictionary""" + properties = data.pop('properties', {}) + required_fields = data.pop('required', []) - if file_type == FileType.PARQUET: - df = pd.read_parquet(filepath) - else: # csv or tsv - sep = '\t' if file_type == FileType.TSV else self.separator - df = pd.read_csv(filepath, sep=sep, header=0 if self.header else None) + frictionless_schema = Schema() - return self.validate_dataframe(df) - - def validate_dataframe(self, df: pd.DataFrame) -> List[Dict]: - """Validate a dataframe against the schema with lenient string type checking. 
- Only reports string validation errors for pattern mismatches, not type mismatches.""" - json_schema = self.to_json_schema() - validator = jsonschema.Draft202012Validator(json_schema) - errors = [] - - for i, row in df.iterrows(): - row_dict = row.to_dict() - validation_errors = sorted(validator.iter_errors(row_dict), key=lambda e: e.path) + type_to_field = { + 'string': fields.StringField, + 'integer': fields.IntegerField, + 'number': fields.NumberField, + 'boolean': fields.BooleanField, + 'array': fields.ArrayField + } + + for name, prop in properties.items(): + field_type = type_to_field.get(prop.get('type', 'string'), fields.StringField) + field = field_type( + name=name, + description=prop.get('description', ''), + constraints={} + ) - for err in validation_errors: - # Skip type validation errors for string fields unless there's a pattern mismatch - if err.validator == "type": - field_name = list(err.path)[-1] if err.path else None - if field_name in self.properties: - prop = self.properties[field_name] - if prop.datatype == "string": - # Skip string type validation errors - continue + # Add constraints if they exist + if 'minimum' in prop: + field.constraints['minimum'] = prop['minimum'] + if 'maximum' in prop: + field.constraints['maximum'] = prop['maximum'] + if 'pattern' in prop: + field.constraints['pattern'] = prop['pattern'] + if 'minLength' in prop: + field.constraints['minLength'] = prop['minLength'] + if 'maxLength' in prop: + field.constraints['maxLength'] = prop['maxLength'] - # Include all other validation errors - errors.append({ - "message": err.message, - "row": i, - "field": list(err.path)[-1] if err.path else None, - "type": "ValidationError", - "failed_keyword": err.validator - }) - - return errors + frictionless_schema.add_field(field) + + # Create our schema instance + schema = cls(**data, properties=properties, required=required_fields) + schema._frictionless_schema = frictionless_schema + return schema -class HDF5Schema(BaseSchema): +class HDF5ValidationSchema(BaseModel): + guid: Optional[str] = Field(alias="@id", default=None) + context: Optional[Dict] = Field(default=DEFAULT_CONTEXT, alias="@context") + name: str + description: str properties: Dict[str, TabularValidationSchema] = Field(default={}) + required: List[str] = Field(default=[]) + def generate_guid(self) -> str: + """Generate a unique identifier for the schema""" + if self.guid is None: + prefix = f"schema-{self.name.lower().replace(' ', '-')}" + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + self.guid = f"ark:{NAAN}/{prefix}-{timestamp}" + return self.guid + + @model_validator(mode='after') + def generate_all_guids(self) -> 'HDF5ValidationSchema': + """Generate GUIDs for this schema and any nested schemas""" + self.generate_guid() + return self + @staticmethod def dataset_to_dataframe(dataset: h5py.Dataset) -> pd.DataFrame: - """Convert any HDF5 dataset to a pandas DataFrame""" + """Convert an HDF5 dataset to a pandas DataFrame""" data = dataset[()] - # structured array convert directly - if dataset.dtype.fields: + if dataset.dtype.fields: # Structured array return pd.DataFrame(data) - - # For multi-dimensional arrays make up column name - elif len(dataset.shape) > 1: - n_cols = dataset.shape[1] if len(dataset.shape) > 1 else 1 + elif len(dataset.shape) > 1: # Multi-dimensional array + n_cols = dataset.shape[1] columns = [f"column_{i}" for i in range(n_cols)] return pd.DataFrame(data, columns=columns) - - # For 1D arrays convert to single column DataFrame - else: + else: # 1D array return 
pd.DataFrame(data, columns=['value']) - @classmethod - def infer_from_file(cls, filepath: str, name: str, description: str, include_min_max: bool = False) -> 'HDF5Schema': - """Infer schema from HDF5 file""" - schema = cls(name=name, description=description) + @classmethod + def infer_from_file(cls, filepath: str, name: str, description: str) -> 'HDF5ValidationSchema': + """Infer schema from an HDF5 file""" + schema = cls( + name=name, + description=description + ) properties = {} - + with h5py.File(filepath, 'r') as f: def process_group(group, parent_path=""): for key, item in group.items(): @@ -394,98 +360,139 @@ def process_group(group, parent_path=""): if isinstance(item, h5py.Dataset): try: df = cls.dataset_to_dataframe(item) - properties[path] = TabularValidationSchema.infer_from_dataframe( - df, + resource = describe(df) + + tabular_schema = TabularValidationSchema( name=f"{name}_{path.replace('/', '_')}", description=f"Dataset at {path}", - include_min_max=include_min_max + separator=",", + header=True, + properties={}, + required=[], + context=None ) + + tabular_schema._frictionless_schema = resource.schema + + for i, field in enumerate(resource.schema.fields): + property_def = { + "type": field.type, + "description": field.description or f"Column {field.name}", + "index": i + } + + tabular_schema.properties[field.name] = property_def + tabular_schema.required.append(field.name) + + properties[path] = tabular_schema + except Exception as e: - print(f"Warning: Could not convert dataset {path} to DataFrame: {str(e)}") + print(f"Warning: Could not process dataset {path}: {str(e)}") elif isinstance(item, h5py.Group): - # Recursively process group contents process_group(item, path) - + process_group(f) schema.properties = properties schema.required = list(properties.keys()) return schema - def validate_file(self, filepath: str) -> List[Dict]: + def validate_file(self, filepath: str) -> List[ValidationError]: """Validate an HDF5 file against the schema""" errors = [] with h5py.File(filepath, 'r') as f: for path, schema in self.properties.items(): try: - # Try to get the dataset using the path dataset = f[path] if isinstance(dataset, h5py.Dataset): - # Convert dataset to DataFrame df = self.dataset_to_dataframe(dataset) - # Validate using the TabularValidationSchema's validate_dataframe method - dataset_errors = schema.validate_dataframe(df) - # Add path information to errors - for error in dataset_errors: - error['path'] = path - errors.extend(dataset_errors) + resource = Resource(data=df, schema=schema._frictionless_schema) + report = resource.validate() + + for task in report.tasks: + for error in task.errors: + # Skip string type errors + if (hasattr(error, 'type') and error.type == 'type-error' and + hasattr(error, 'note') and 'type is "string' in error.note): + continue + + validation_error = ValidationError( + message=error.message, + row=error.rowNumber if hasattr(error, 'rowNumber') else None, + field=error.fieldName if hasattr(error, 'fieldName') else None, + type="ValidationError", + failed_keyword=error.type if hasattr(error, 'type') else "error", + path=path + ) + errors.append(validation_error) + except KeyError: - errors.append({ - "message": f"Dataset {path} not found", - "path": path, - "type": "ValidationError", - "failed_keyword": "required" - }) + errors.append(ValidationError( + message=f"Dataset {path} not found", + type="ValidationError", + failed_keyword="required", + path=path + )) except Exception as e: - errors.append({ - "message": f"Error validating dataset 
{path}: {str(e)}", - "path": path, - "type": "ValidationError", - "failed_keyword": "format" - }) + errors.append(ValidationError( + message=f"Error validating dataset {path}: {str(e)}", + type="ValidationError", + failed_keyword="format", + path=path + )) return errors + + def to_dict(self) -> dict: + """Convert the schema to a dictionary format including all fields""" + return self.model_dump(by_alias=True) + @classmethod + def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': + """Create a schema instance from a dictionary""" + properties = { + path: TabularValidationSchema.from_dict(schema_dict) + for path, schema_dict in data.get('properties', {}).items() + } + + return cls( + name=data['name'], + description=data['description'], + properties=properties, + required=data.get('required', []) + ) + +def write_schema(schema: TabularValidationSchema, output_file: str): + """Write a schema to a file""" + schema_dict = schema.to_dict() -def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: + with open(output_file, 'w') as f: + json.dump(schema_dict, f, indent=2) + +def AppendProperty(schemaFilepath: str, propertyInstance, propertyName: str) -> None: # check that schemaFile exists schemaPath = pathlib.Path(schemaFilepath) - if not schemaPath.exists(): raise Exception with schemaPath.open("r+") as schemaFile: schemaFileContents = schemaFile.read() - schemaJson = json.loads(schemaFileContents) + schemaJson = json.loads(schemaFileContents) - # load the model into a tabular validation schema schemaModel = TabularValidationSchema.model_validate(schemaJson) - # TODO check for inconsitencies - - # does there exist a property with same name if propertyName in [key for key in schemaModel.properties.keys()]: raise PropertyNameException(propertyName) - # does there exist a property with same column number - schema_indicies = [ val.index for val in schemaModel.properties.values()] - - # check overlap of indicies - # CheckOverlap - - - # add new property to schema + schema_indicies = [val['index'] for val in schemaModel.properties.values()] + schemaModel.properties[propertyName] = propertyInstance - - # add new property as required schemaModel.required.append(propertyName) + schemaJson = json.dumps(schemaModel.model_dump(by_alias=True, exclude_none=True), indent=2) - # serialize model to json - schemaJson = json.dumps(schemaModel.model_dump(by_alias=True) , indent=2) - - # overwrite file contents + # overwrite file contents schemaFile.seek(0) schemaFile.write(schemaJson) @@ -525,64 +532,3 @@ def ReadSchemaLocal(schemaFile: str) -> TabularValidationSchema: tabularSchema = TabularValidationSchema.model_validate(schemaJson) return tabularSchema -def ReadSchema(schemaFile:str) -> TabularValidationSchema: - ''' Read a schema specified by the argument schemaFile - - The schemaFile parameter can be a url to a rawgithub link, or an ark identifier. - If the ark identifier is in the supplied, default schemas provided in the fairscape cli pacakges will be searched. 
- If there is no match then - ''' - - if 'raw.githubusercontent' in schemaFile: - schemaInstance = ReadSchemaGithub(schemaFile) - return schemaInstance - - - elif 'ark' in schemaFile: - defaultSchemas = ImportDefaultSchemas() - matchingSchemas = list(filter(lambda schema: schema.guid == str(schemaFile), defaultSchemas)) - - if len(matchingSchemas) == 0: - # request against fairscape - schemaInstance = ReadSchemaFairscape(schemaFile) - return schemaInstance - else: - defaultSchema = matchingSchemas[0] - return defaultSchema - - else: - # schema must be a path that exists - schemaInstance = ReadSchemaLocal(schemaFile) - return schemaInstance - -def WriteSchema(tabular_schema: TabularValidationSchema, schema_file): - """ Helper Function for writing files - """ - - schema_dictionary = tabular_schema.model_dump(by_alias=True) - schema_json = json.dumps(schema_dictionary, indent=2) - - # dump json to a file - with open(schema_file, "w") as output_file: - output_file.write(schema_json) - -@lru_cache -def ImportDefaultSchemas()-> List[TabularValidationSchema]: - defaultSchemaLocation = pathlib.Path(os.path.dirname(os.path.realpath(__file__))) / 'default_schemas' - schemaPaths = list(defaultSchemaLocation.rglob("*/*.json")) - - defaultSchemaList = [] - for schemaPathElem in schemaPaths: - - with schemaPathElem.open("r") as inputSchema: - inputSchemaData = inputSchema.read() - schemaJson = json.loads(inputSchemaData) - - try: - schemaElem = TabularValidationSchema.model_validate(schemaJson) - defaultSchemaList.append(schemaElem) - except: - # TODO handle validation failures from default schemas - pass - - return defaultSchemaList diff --git a/src/fairscape_cli/models/software.py b/src/fairscape_cli/models/software.py index fb60242..83ebda0 100644 --- a/src/fairscape_cli/models/software.py +++ b/src/fairscape_cli/models/software.py @@ -1,21 +1,12 @@ -from fairscape_cli.models.base import FairscapeBaseModel -from fairscape_cli.models.utils import GenerateDatetimeSquid, FileNotInCrateException -from fairscape_cli.config import NAAN import pathlib - -from pydantic import ( - Field, - AnyUrl, - ConfigDict -) from datetime import datetime -from typing import ( - Optional, - Union, - Dict, - List -) +from typing import Optional, Union, Dict, List +from pydantic import Field, AnyUrl, ConfigDict + +from fairscape_cli.config import NAAN +from fairscape_cli.models.base import FairscapeBaseModel +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid class Software(FairscapeBaseModel): diff --git a/src/fairscape_cli/models/utils.py b/src/fairscape_cli/models/utils.py index b51c6e9..dfe270a 100644 --- a/src/fairscape_cli/models/utils.py +++ b/src/fairscape_cli/models/utils.py @@ -1,70 +1,45 @@ -# Python Interface for Registering Unique GUIDS -from sqids import Sqids -from pydantic import ( - ValidationError -) -from typing import ( - List - ) -import datetime -from fairscape_cli.config import ( - NAAN - ) -import random +from pathlib import Path +from typing import Set, Dict, List, Optional, Tuple +import subprocess -squids = Sqids(min_length=6) - -def GenerateDatetimeSquid(): - try: - timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) - sq = squids.encode([timestamp_int, random.randint(0, 10000)]) - except: - timestamp_int = int(datetime.datetime.utcnow().timestamp()) - sq = squids.encode([timestamp_int]) - - return sq - - -def GenerateDatetimeGUID(prefix: str)->str: - try: - timestamp_int = int(datetime.datetime.now(datetime.UTC).timestamp()) - sq = squids.encode([timestamp_int]) - 
except: - timestamp_int = int(datetime.datetime.utcnow().timestamp()) - sq = squids.encode([timestamp_int]) - - return f"ark:{NAAN}/{prefix}-{sq}" - -def GenerateGUID(data: List[int], prefix: str)-> str: - squid_encoded = squids.encode(data) - return f"ark:{NAAN}/{prefix}-{squid_encoded}" +from pydantic import ValidationError +from fairscape_cli.models.base import FairscapeBaseModel def InstantiateModel(ctx, metadata: dict, modelInstance): try: modelInstance.model_validate(metadata) return modelInstance - except ValidationError as metadataError: print('ERROR: MetadataValidationError', end='') for validationFailure in metadataError.errors(): print(f'loc: {validationFailure.loc}\tinput: {validationFailure.input}\tmsg: {validationFailure.msg}', end='') ctx.exit(code=1) - - -def ValidateGUID(ctx, param, value): - """ Make sure a GUID reference is reachable return JSON Metadata - """ - # validate fairscape ARK - - # validate DOI - - # validate url - pass - - class FileNotInCrateException(Exception): def __init__(self, cratePath, filePath): self.message = f"Error: FileNotFound inside ro crate\ncratePath: {str(cratePath)}\tfilePath{str(filePath)}" super().__init__(self.message) + +def getDirectoryContents(directory: Path) -> Set[Path]: + """Get set of all files in directory recursively""" + return set(p for p in directory.rglob('*') if p.is_file()) + +def run_command(command: str) -> Tuple[bool, str, str]: + """Execute command and return success status with output""" + try: + result = subprocess.run( + command.split(), + capture_output=True, + text=True + ) + return result.returncode == 0, result.stdout, result.stderr + except Exception as e: + return False, "", str(e) + +def getEntityFromCrate(crate_instance, entity_id: str) -> Optional[FairscapeBaseModel]: + """Get entity from crate by ID""" + for entity in crate_instance.metadataGraph: + if entity.guid == entity_id: + return entity.dict() + return None \ No newline at end of file diff --git a/src/fairscape_cli/rocrate/rocrate.py b/src/fairscape_cli/rocrate/rocrate.py index 2b0438b..7fb2c22 100644 --- a/src/fairscape_cli/rocrate/rocrate.py +++ b/src/fairscape_cli/rocrate/rocrate.py @@ -2,32 +2,42 @@ import pathlib import shutil import json -from pydantic import ValidationError from datetime import datetime +from typing import List, Optional, Union +from pydantic import ValidationError +from fairscape_cli.config import NAAN +from fairscape_cli.models.guid_utils import GenerateDatetimeSquid from fairscape_cli.models.utils import ( - FileNotInCrateException + FileNotInCrateException, + getDirectoryContents, + getEntityFromCrate, + run_command ) from fairscape_cli.models import ( + # Core models Dataset, - GenerateDataset, Software, - GenerateSoftware, Computation, + ROCrate, + BagIt, + + # Generator functions + GenerateDataset, + GenerateSoftware, GenerateComputation, GenerateROCrate, - ROCrate, + + # RO Crate operations ReadROCrateMetadata, AppendCrate, CopyToROCrate, - BagIt -) - -from typing import ( - List, - Optional, - Union + UpdateCrate, + + # Additional utilities + generateSummaryStatsElements, + registerOutputs ) @@ -204,6 +214,7 @@ def registerSoftware( @click.option('--keywords', required=True, multiple=True) @click.option('--data-format', required=True) @click.option('--filepath', required=True) +@click.option('--summary-statistics-filepath', required=False, type=click.Path(exists=True)) @click.option('--used-by', required=False, multiple=True) @click.option('--derived-from', required=False, multiple=True) 
@click.option('--generated-by', required=False, multiple=True) @@ -224,6 +235,7 @@ def registerDataset( keywords: List[str], data_format: str, filepath: str, + summary_statistics_filepath: Optional[str], used_by: Optional[List[str]], derived_from: Optional[List[str]], generated_by: Optional[List[str]], @@ -231,8 +243,7 @@ def registerDataset( associated_publication: Optional[str], additional_documentation: Optional[List[str]], ): - """Register Dataset object metadata with the specified RO-Crate - """ + """Register Dataset object metadata with the specified RO-Crate""" try: crate_instance = ReadROCrateMetadata(rocrate_path) except Exception as exc: @@ -240,8 +251,33 @@ def registerDataset( ctx.exit(code=1) try: + # Generate main dataset GUID + sq_dataset = GenerateDatetimeSquid() + dataset_guid = guid if guid else f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq_dataset}" + + summary_stats_guid = None + elements = [] + + # Handle summary statistics if provided + if summary_statistics_filepath: + summary_stats_guid, summary_stats_instance, computation_instance = generateSummaryStatsElements( + name=name, + author=author, + keywords=keywords, + date_published=date_published, + version=version, + associated_publication=associated_publication, + additional_documentation=additional_documentation, + schema=schema, + dataset_guid=dataset_guid, + summary_statistics_filepath=summary_statistics_filepath, + crate_path=rocrate_path + ) + elements.extend([computation_instance, summary_stats_instance]) + + # Generate main dataset dataset_instance = GenerateDataset( - guid=guid, + guid=dataset_guid, url=url, author=author, name=name, @@ -257,9 +293,12 @@ def registerDataset( generatedBy=generated_by, usedBy=used_by, filepath=filepath, - cratePath=rocrate_path + cratePath=rocrate_path, + summary_stats_guid=summary_stats_guid ) - AppendCrate(cratePath = rocrate_path, elements=[dataset_instance]) + + elements.insert(0, dataset_instance) + AppendCrate(cratePath=rocrate_path, elements=elements) click.echo(dataset_instance.guid) except FileNotInCrateException as e: @@ -275,8 +314,6 @@ def registerDataset( click.echo(f"ERROR: {str(exc)}") ctx.exit(code=1) - - @register.command('computation') @click.argument('rocrate-path', type=click.Path(exists=True, path_type=pathlib.Path)) @@ -434,6 +471,8 @@ def software( @click.option('--data-format', required=True) @click.option('--source-filepath', required=True) @click.option('--destination-filepath', required=True) +@click.option('--summary-statistics-source', required=False, type=click.Path(exists=True)) +@click.option('--summary-statistics-destination', required=False, type=click.Path()) @click.option('--used-by', required=False, multiple=True) @click.option('--derived-from', required=False, multiple=True) @click.option('--generated-by', required=False, multiple=True) @@ -455,6 +494,8 @@ def dataset( data_format, source_filepath, destination_filepath, + summary_statistics_source, + summary_statistics_destination, used_by, derived_from, generated_by, @@ -462,9 +503,7 @@ def dataset( associated_publication, additional_documentation, ): - """Add a Dataset file and its metadata to the RO-Crate. 
- """ - + """Add a Dataset file and its metadata to the RO-Crate.""" try: crateInstance = ReadROCrateMetadata(rocrate_path) except Exception as exc: @@ -472,9 +511,40 @@ def dataset( ctx.exit(code=1) try: + # Copy main dataset file CopyToROCrate(source_filepath, destination_filepath) + + # Generate main dataset GUID + sq_dataset = GenerateDatetimeSquid() + dataset_guid = guid if guid else f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq_dataset}" + + summary_stats_guid = None + elements = [] + + # Handle summary statistics if provided + if summary_statistics_source and summary_statistics_destination: + # Copy summary statistics file + CopyToROCrate(summary_statistics_source, summary_statistics_destination) + + # Generate summary statistics elements + summary_stats_guid, summary_stats_instance, computation_instance = generateSummaryStatsElements( + name=name, + author=author, + keywords=keywords, + date_published=date_published, + version=version, + associated_publication=associated_publication, + additional_documentation=additional_documentation, + schema=schema, + dataset_guid=dataset_guid, + summary_statistics_filepath=summary_statistics_destination, + crate_path=rocrate_path + ) + elements.extend([computation_instance, summary_stats_instance]) + + # Generate main dataset dataset_instance = GenerateDataset( - guid=guid, + guid=dataset_guid, url=url, author=author, name=name, @@ -490,9 +560,12 @@ def dataset( generatedBy=generated_by, usedBy=used_by, filepath=destination_filepath, - cratePath=rocrate_path + cratePath=rocrate_path, + summary_stats_guid=summary_stats_guid ) - AppendCrate(cratePath = rocrate_path, elements=[dataset_instance]) + + elements.insert(0, dataset_instance) + AppendCrate(cratePath=rocrate_path, elements=elements) click.echo(dataset_instance.guid) except ValidationError as e: @@ -503,5 +576,85 @@ def dataset( except Exception as exc: click.echo(f"ERROR: {str(exc)}") ctx.exit(code=1) + +################# +# Summary Statistics +################# +@rocrate.command('compute-statistics') +@click.argument('rocrate-path', type=click.Path(exists=True, path_type=pathlib.Path)) +@click.option('--dataset-id', required=True, help='ID of dataset to compute statistics for') +@click.option('--software-id', required=True, help='ID of software to run') +@click.option('--command', required=True, help='Python command to execute (e.g. 
python)') +@click.pass_context +def compute_statistics( + ctx, + rocrate_path: pathlib.Path, + dataset_id: str, + software_id: str, + command: str +): + """Compute statistics for a dataset using specified software""" + crate_instance = ReadROCrateMetadata(rocrate_path) + initial_files = getDirectoryContents(rocrate_path) - # TODO add to cache + # Get original dataset info + dataset_info = getEntityFromCrate(crate_instance, dataset_id) + software_info = getEntityFromCrate(crate_instance, software_id) + if not dataset_info or not software_info: + raise ValueError(f"Dataset or software not found in crate") + + # Get original dataset author + original_author = dataset_info.get("author", "Unknown") + dataset_path = dataset_info.get("contentUrl", "").replace("file:///", "") + software_path = software_info.get("contentUrl", "").replace("file:///", "") + + if not dataset_path or not software_path: + raise ValueError("Dataset or software path not found") + + full_command = f"{command} {software_path} {dataset_path} {rocrate_path}" + success, stdout, stderr = run_command(full_command) + if not success: + raise RuntimeError(f"Command failed: {stderr}") + + final_files = getDirectoryContents(rocrate_path) + new_files = final_files - initial_files + if not new_files: + raise RuntimeError("No output files generated") + + computation_instance = GenerateComputation( + guid=None, + name=f"Statistics Computation for {dataset_id}", + runBy="Fairscape-CLI", + command=full_command, + dateCreated=datetime.now().isoformat(), + description=f"Generated statistics\nstdout:\n{stdout}\nstderr:\n{stderr}", + keywords=["statistics"], + usedSoftware=[software_id], + usedDataset=[dataset_id], + generated=[] + ) + + output_instances = registerOutputs( + new_files=new_files, + computation_id=computation_instance.guid, + dataset_id=dataset_id, + author=original_author + ) + + stats_output = [out.guid for out in output_instances] + computation_instance.generated = stats_output + + if stats_output: + # Update the original dataset metadata + dataset_info["hasSummaryStatistics"] = stats_output + # Generate a new Dataset instance with updated metadata + updated_dataset = Dataset.model_validate(dataset_info) + + # Update the dataset in the crate and append new elements + UpdateCrate(cratePath=rocrate_path, element=updated_dataset) + AppendCrate( + cratePath=rocrate_path, + elements=[computation_instance] + output_instances + ) + + click.echo(computation_instance.guid) \ No newline at end of file diff --git a/src/fairscape_cli/schema/schema.py b/src/fairscape_cli/schema/schema.py index e17b9d3..a09cf2b 100644 --- a/src/fairscape_cli/schema/schema.py +++ b/src/fairscape_cli/schema/schema.py @@ -3,49 +3,37 @@ from prettytable import PrettyTable import pathlib from pydantic import ( - ValidationError + ValidationError ) from typing import ( Union, Type ) - from fairscape_cli.models.schema.tabular import ( TabularValidationSchema, - ReadSchema, - ImportDefaultSchemas, - WriteSchema, + HDF5ValidationSchema, + write_schema as WriteSchema, StringProperty, NumberProperty, IntegerProperty, BooleanProperty, ArrayProperty, ClickAppendProperty, - PropertyNameException, - ColumnIndexException, DatatypeEnum, Items, - FileType, - HDF5Schema -) - -from fairscape_cli.config import ( - FAIRSCAPE_URI ) - @click.group('schema') def schema(): """Invoke operations on dataset schema. 
""" pass - @schema.command('create-tabular') @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) -@click.option('--guid', required=False, type=str, default="", show_default=False) +@click.option('--guid', required=False, type=str, default=None, show_default=False) @click.option('--separator', type=str, required=True) @click.option('--header', required=False, type=bool, default=False) @click.argument('schema_file', type=str) @@ -61,7 +49,6 @@ def create_tabular_schema( ): """Initialize a Tabular Schema. """ - # create the model try: schema_model = TabularValidationSchema.model_validate({ "name": name, @@ -80,8 +67,7 @@ def create_tabular_schema( ctx.exit(code=1) WriteSchema(schema_model, schema_file) - click.echo(f"Wrote Schema: {str(schema_file)}") - + click.echo(f"Wrote Schema: {str(schema_file)}") @schema.group('add-property') def add_property(): @@ -89,7 +75,6 @@ def add_property(): """ pass - @add_property.command('string') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -118,7 +103,6 @@ def add_property_string(ctx, name, index, description, value_url, pattern, schem ClickAppendProperty(ctx, schema_file, stringPropertyModel, name) - @add_property.command('number') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -141,7 +125,6 @@ def add_property_number(ctx, name, index, description, maximum, minimum, value_u "description": description, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating NumberProperty") for validationFailure in metadataError.errors(): @@ -150,7 +133,6 @@ def add_property_number(ctx, name, index, description, maximum, minimum, value_u ClickAppendProperty(ctx, schema_file, numberPropertyModel, name) - @add_property.command('boolean') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -169,7 +151,6 @@ def add_property_boolean(ctx, name, index, description, value_url, schema_file): "description": description, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating BooleanProperty") for validationFailure in metadataError.errors(): @@ -178,7 +159,6 @@ def add_property_boolean(ctx, name, index, description, value_url, schema_file): ClickAppendProperty(ctx, schema_file, booleanPropertyModel, name) - @add_property.command('integer') @click.option('--name', type=str, required=True) @click.option('--index', type=int, required=True) @@ -201,7 +181,6 @@ def add_property_integer(ctx, name, index, description, maximum, minimum, value_ "minimum": minimum, "valueURL": value_url }) - except ValidationError as metadataError: click.echo("ERROR Validating IntegerProperty") for validationFailure in metadataError.errors(): @@ -210,7 +189,6 @@ def add_property_integer(ctx, name, index, description, maximum, minimum, value_ ClickAppendProperty(ctx, schema_file, integerPropertyModel, name) - @add_property.command('array') @click.option('--name', type=str, required=True) @click.option('--index', type=str, required=True) @@ -244,7 +222,6 @@ def add_property_array(ctx, name, index, description, value_url, items_datatype, uniqueItems=unique_items, items=Items(datatype=datatype_enum) ) - except ValidationError as metadataError: print("ERROR: MetadataValidationError") for validationFailure in metadataError.errors(): @@ -253,12 +230,11 @@ def add_property_array(ctx, name, index, description, value_url, 
items_datatype, ClickAppendProperty(ctx, schema_file, arrayPropertyModel, name) - -def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5Schema]]: +def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, HDF5ValidationSchema]]: """Determine which schema type to use based on file extension""" ext = pathlib.Path(filepath).suffix.lower()[1:] if ext in ('h5', 'hdf5'): - return HDF5Schema + return HDF5ValidationSchema elif ext in ('csv', 'tsv', 'parquet'): return TabularValidationSchema else: @@ -270,7 +246,6 @@ def determine_schema_type(filepath: str) -> Type[Union[TabularValidationSchema, @click.pass_context def validate(ctx, schema, data): """Execute validation of a Schema against the provided data.""" - # Check if schema file exists (if not a default schema) if 'ark' not in schema: schema_path = pathlib.Path(schema) if not schema_path.exists(): @@ -283,39 +258,35 @@ def validate(ctx, schema, data): ctx.exit(1) try: - # Load the schema file with open(schema) as f: schema_json = json.load(f) - # Determine schema type based on the data file schema_class = determine_schema_type(data) - validation_schema = schema_class.model_validate(schema_json) + validation_schema = schema_class.from_dict(schema_json) - # Validate the file validation_errors = validation_schema.validate_file(data) if len(validation_errors) != 0: - # Create a pretty table of validation errors error_table = PrettyTable() - if isinstance(validation_schema, HDF5Schema): + if isinstance(validation_schema, HDF5ValidationSchema): error_table.field_names = ['path', 'error_type', 'failed_keyword', 'message'] else: error_table.field_names = ['row', 'error_type', 'failed_keyword', 'message'] for err in validation_errors: - if isinstance(validation_schema, HDF5Schema): + if isinstance(validation_schema, HDF5ValidationSchema): error_table.add_row([ - err.get("path"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.path, + err.type, + err.failed_keyword, + str(err.message) ]) else: error_table.add_row([ - err.get("row"), - err.get("type"), - err.get("failed_keyword"), - str(err.get('message')) + err.row, + err.type, + err.failed_keyword, + str(err.message) ]) print(error_table) @@ -337,29 +308,24 @@ def validate(ctx, schema, data): @click.option('--name', required=True, type=str) @click.option('--description', required=True, type=str) @click.option('--guid', required=False, type=str, default="", show_default=False) -@click.option('--include-min-max', is_flag=True, help="Include min and max values for numeric and integer fields") @click.argument('input_file', type=click.Path(exists=True)) @click.argument('schema_file', type=str) @click.pass_context -def infer_schema(ctx, name, description, guid, include_min_max, input_file, schema_file): +def infer_schema(ctx, name, description, guid, input_file, schema_file): """Infer a schema from a file (CSV, TSV, Parquet, or HDF5).""" try: - # Determine which schema type to use based on input file schema_class = determine_schema_type(input_file) - # Infer the schema schema_model = schema_class.infer_from_file( input_file, name, - description, - include_min_max + description ) if guid: schema_model.guid = guid WriteSchema(schema_model, schema_file) - # Get file type for display ext = pathlib.Path(input_file).suffix.lower()[1:] click.echo(f"Inferred Schema from {ext} file: {str(schema_file)}") diff --git a/tests/stats-compute-tests/numbers.csv b/tests/stats-compute-tests/numbers.csv new file mode 100644 index 
0000000..aa9321c --- /dev/null +++ b/tests/stats-compute-tests/numbers.csv @@ -0,0 +1,11 @@ +column1,column2,column3 +1,0.557412965,0.015765057 +2,0.595715476,4.632460772 +3,1.000511292,0.516892255 +4,3.634542545,16.3678812 +5,0.216278402,0.37567848 +6,3.346647036,3.666700797 +7,2.864322316,2.292766985 +8,0.508136324,0.434491093 +9,5.934758558,1.647603341 +10,1.092459463,1.04885126 \ No newline at end of file diff --git a/tests/stats-compute-tests/summary.py b/tests/stats-compute-tests/summary.py new file mode 100644 index 0000000..633bb35 --- /dev/null +++ b/tests/stats-compute-tests/summary.py @@ -0,0 +1,58 @@ +import pandas as pd +import sys +import os +from pathlib import Path + +def generate_summary_stats(input_path, output_dir): + """ + Generate summary statistics for a CSV file and save to output directory + + Parameters: + input_path (str): Path to input CSV file + output_dir (str): Directory to save output summary statistics + """ + # Read the input file + df = pd.read_csv(input_path) + + # Create summary statistics + summary_stats = pd.DataFrame({ + 'column_name': df.columns, + 'data_type': df.dtypes.astype(str), + 'count': df.count(), + 'null_count': df.isnull().sum(), + 'null_percentage': (df.isnull().sum() / len(df) * 100).round(2), + 'unique_values': df.nunique(), + }) + + # Add numeric column statistics + numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'mean'] = df[numeric_cols].mean() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'std'] = df[numeric_cols].std() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'min'] = df[numeric_cols].min() + summary_stats.loc[summary_stats['column_name'].isin(numeric_cols), 'max'] = df[numeric_cols].max() + + # Create output directory if it doesn't exist + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Generate output filename from input filename + input_filename = os.path.basename(input_path) + output_filename = f"summary_stats_{input_filename}" + output_path = os.path.join(output_dir, output_filename) + + # Save summary statistics + summary_stats.to_csv(output_path, index=False) + print(f"Summary statistics saved to: {output_path}") + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python summary.py ") + sys.exit(1) + + input_path = sys.argv[1] + output_dir = sys.argv[2] + + try: + generate_summary_stats(input_path, output_dir) + except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) \ No newline at end of file diff --git a/tests/test_compute_stats.py b/tests/test_compute_stats.py new file mode 100644 index 0000000..e5c5711 --- /dev/null +++ b/tests/test_compute_stats.py @@ -0,0 +1,204 @@ +import os +import sys +import pathlib +import json +import shutil +import unittest +import subprocess +import datetime +from typing import Tuple + +class TestStatisticsCliWorkflow(unittest.TestCase): + + def setUp(self): + # Create test directory + self.test_dir = pathlib.Path.cwd() / 'tests' / 'stats-compute-tests' + self.test_dir.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + # Only remove the generated files, not the entire directory + metadata_file = self.test_dir / 'ro-crate-metadata.json' + stats_file = self.test_dir / 'summary_stats_numbers.csv' + summary_file = self.test_dir / 'fake_summary.csv' + + if metadata_file.exists(): + metadata_file.unlink() + if stats_file.exists(): + stats_file.unlink() + if summary_file.exists(): + summary_file.unlink() + + def 
run_cli_command(self, command: str) -> Tuple[int, str, str]: + """Run a CLI command and return returncode, stdout, stderr""" + process = subprocess.Popen( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + stdout, stderr = process.communicate() + return process.returncode, stdout.strip(), stderr.strip() + + def test_cli_workflow(self): + # Change to test directory + os.chdir(self.test_dir) + + # Initialize ROCrate + init_cmd = '''python -m fairscape_cli rocrate init \ + --name "Data Analysis Project" \ + --organization-name "My Organization" \ + --project-name "Data Analysis" \ + --description "A project for analyzing data using summary statistics" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python"''' + + returncode, stdout, stderr = self.run_cli_command(init_cmd) + self.assertEqual(returncode, 0, f"ROCrate init failed: {stderr}") + rocrate_guid = stdout.strip() + + # Register software + software_cmd = f'''python -m fairscape_cli rocrate register software ./ \ + --name "Summary Statistics Generator" \ + --author "Your Name" \ + --version "1.0.0" \ + --description "Python script that generates summary statistics for CSV data" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python" \ + --file-format "text/x-python" \ + --date-modified "{datetime.date.today().isoformat()}" \ + --filepath "summary.py"''' + + returncode, stdout, stderr = self.run_cli_command(software_cmd) + self.assertEqual(returncode, 0, f"Software registration failed: {stderr}") + software_guid = stdout.strip() + + # Register dataset + dataset_cmd = f'''python -m fairscape_cli rocrate register dataset ./ \ + --name "Analysis Dataset" \ + --author "Your Name" \ + --version "1.0.0" \ + --date-published "{datetime.date.today().isoformat()}" \ + --description "Dataset for statistical analysis" \ + --keywords "data-analysis" --keywords "statistics" --keywords "python" \ + --data-format "text/csv" \ + --filepath "numbers.csv"''' + + returncode, stdout, stderr = self.run_cli_command(dataset_cmd) + self.assertEqual(returncode, 0, f"Dataset registration failed: {stderr}") + dataset_guid = stdout.strip() + + # Compute statistics + compute_cmd = f'''python -m fairscape_cli rocrate compute-statistics ./ \ + --dataset-id "{dataset_guid}" \ + --software-id "{software_guid}" \ + --command "python"''' + + returncode, stdout, stderr = self.run_cli_command(compute_cmd) + self.assertEqual(returncode, 0, f"Computation failed: {stderr}") + computation_guid = stdout.strip() + + # Verify the metadata file exists and has correct structure + metadata_file = self.test_dir / 'ro-crate-metadata.json' + self.assertTrue(metadata_file.exists()) + + # Load and verify metadata + with open(metadata_file) as f: + metadata = json.load(f) + + # Basic structure tests + self.assertEqual(metadata['name'], "Data Analysis Project") + self.assertEqual(metadata['@id'], rocrate_guid) + + # Verify all components are present in @graph + guids = [item['@id'] for item in metadata['@graph']] + self.assertIn(software_guid, guids) + self.assertIn(dataset_guid, guids) + self.assertIn(computation_guid, guids) + + # Find computation record + computation = next(item for item in metadata['@graph'] if item['@id'] == computation_guid) + + # Verify computation relationships + self.assertEqual(computation['usedSoftware'], [software_guid]) + self.assertEqual(computation['usedDataset'], [dataset_guid]) + self.assertTrue(len(computation['generated']) > 0) + + # Verify output file exists + output_file = 
self.test_dir / 'summary_stats_numbers.csv' + self.assertTrue(output_file.exists()) + + # Find dataset record and verify it has summary statistics + dataset = next(item for item in metadata['@graph'] if item['@id'] == dataset_guid) + self.assertTrue('hasSummaryStatistics' in dataset) + self.assertEqual(dataset['hasSummaryStatistics'], computation['generated']) + + def test_dataset_with_summary_stats(self): + # Change to test directory + os.chdir(self.test_dir) + + # Initialize ROCrate + init_cmd = '''python -m fairscape_cli rocrate init \ + --name "Dataset Summary Test" \ + --organization-name "Test Organization" \ + --project-name "Summary Stats Test" \ + --description "Testing dataset registration with summary statistics" \ + --keywords "data" --keywords "testing" --keywords "summary-stats"''' + + returncode, stdout, stderr = self.run_cli_command(init_cmd) + self.assertEqual(returncode, 0, f"ROCrate init failed: {stderr}") + rocrate_guid = stdout.strip() + + # Create fake summary file + summary_path = self.test_dir / 'fake_summary.csv' + with open(summary_path, 'w') as f: + f.write("statistic,value\nmean,42.0\nmedian,41.5\nstd,5.2") + + # Register dataset with summary statistics + dataset_cmd = f'''python -m fairscape_cli rocrate register dataset ./ \ + --name "Test Dataset" \ + --author "Test Author" \ + --version "1.0.0" \ + --date-published "{datetime.date.today().isoformat()}" \ + --description "Dataset with pre-existing summary statistics" \ + --keywords "data" --keywords "testing" \ + --data-format "text/csv" \ + --filepath "numbers.csv" \ + --summary-statistics-filepath "fake_summary.csv"''' + + returncode, stdout, stderr = self.run_cli_command(dataset_cmd) + self.assertEqual(returncode, 0, f"Dataset registration failed: {stderr}") + dataset_guid = stdout.strip() + + # Verify the metadata file exists and has correct structure + metadata_file = self.test_dir / 'ro-crate-metadata.json' + self.assertTrue(metadata_file.exists()) + + # Load and verify metadata + with open(metadata_file) as f: + metadata = json.load(f) + + # Find dataset record and verify it has summary statistics + dataset = next(item for item in metadata['@graph'] if item['@id'] == dataset_guid) + + # Get summary stats ID + summary_stats_id = dataset['hasSummaryStatistics'] + + # Find the summary statistics dataset in the graph - with more flexible matching + summary_stats = next( + (item for item in metadata['@graph'] + if 'stats' in item['@id'] and item['@type'] == 'https://w3id.org/EVI#Dataset'), + None + ) + self.assertEqual(summary_stats['@type'], 'https://w3id.org/EVI#Dataset') + self.assertTrue('stats' in summary_stats['@id']) + self.assertEqual(summary_stats['author'], 'Test Author') + + computation = next( + (item for item in metadata['@graph'] + if item['@type'] == 'https://w3id.org/EVI#Computation' and summary_stats_id in item.get('generated', [])), + None + ) + self.assertIsNotNone(computation) + self.assertEqual(computation['usedDataset'], [dataset_guid]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_rocrate_api.py b/tests/test_rocrate_api.py index d19fbae..2aec4d5 100644 --- a/tests/test_rocrate_api.py +++ b/tests/test_rocrate_api.py @@ -2,6 +2,7 @@ import sys import pathlib import json +import shutil sys.path.insert( 0, @@ -19,145 +20,145 @@ from fairscape_cli.models.dataset import GenerateDataset from fairscape_cli.models.software import GenerateSoftware from fairscape_cli.models.rocrate import ( - GenerateROCrate, - ReadROCrateMetadata, - 
AppendCrate + GenerateROCrate, + ReadROCrateMetadata, + AppendCrate ) from sqids import Sqids class TestAPI(unittest.TestCase): - - def test_api(self): - rocratePath = pathlib.Path.cwd() / 'tests'/ 'data' / 'test_api' - - # delete the test_api folder - metadataFile = rocratePath / 'ro-crate-metadata.json' - metadataFile.unlink() - - rocrate_metadata = { - "guid": "ark:59853/UVA/B2AI/rocrate_test", - "name": 'test rocrate', - "organizationName": "UVA", - "projectName": "B2AI", - "description": "Testing ROCrate Model", - "keywords": ["test", "fair"], - "path": rocratePath - } - - # touch a file for the dataset to say exists - - rocrate = GenerateROCrate(**rocrate_metadata) - - software_metadata={ - "guid" : "955cf26c-e3a3-4f0f-b2df-fca4c693cac4:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", - "author": "Cell Maps team", - "url": "https://github.com/idekerlab/cellmaps_utils", - "name": "cellmaps_utils", - "keywords": [ - "CM4AI", - "0.7alpha", - "MDA-MB-468", - "untreated", - "IF microscopy", - "images", - "breast; mammary gland", - "chromatin", - "tools", - "cellmaps_utils" - ], - "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin Contains utilities needed by Cell Maps tools", - "dateModified": "2024-10-22", - "version": "0.5.0", - "fileFormat": "py", - "usedByComputation": [], - "associatedPublication": None, - "additionalDocumentation": None, - "filepath": "https://github.com/idekerlab/cellmaps_utils", - "cratePath": rocratePath - } - software = GenerateSoftware(**software_metadata) - - yellowFolder = rocratePath / 'yellow' - yellowFolder.mkdir(exist_ok=True) - - # create 10k identifiers - datasetList = [] - #for i in range(100000): - # fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' - # datasetFilePath = yellowFolder / fileName - # datasetFilePath.touch(exist_ok=True) - - for i in range(10000): - fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' - datasetMetadata = { - "guid": "322ab5a2-e6a7-4c46-be79-cbf3e9453cde:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", - "name": "B2AI_5_untreated_B5_R5_z01_yellow.jpg yellow channel image", - "keywords": [ - "CM4AI", - "0.7alpha", - "MDA-MB-468", - "untreated", - "IF microscopy", - "images", - "breast; mammary gland", - "chromatin", - "yellow", - "IF", - "image", - "ER (Calreticulin antibody)" - ], - "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin IF image file", - "author": "Lundberg Lab", - "datePublished": "2024-10-22", - "version": "0.7alpha", - "dataFormat": "jpg", - "generatedBy": [], - "derivedFrom": [], - "usedBy": [], - "url": None, - "associatedPublication": None, - "additionalDocumentation": None, - "schema": None, - "filepath": f"file:///yellow/{fileName}", - "cratePath": rocratePath - } - dataset = GenerateDataset(**datasetMetadata) - datasetList.append(dataset) - - AppendCrate(rocratePath, datasetList) - - # read in the crate metadata - rocrateMetadataRecord = ReadROCrateMetadata(rocratePath) - rocrateGUIDs = [ elem.guid for elem in rocrateMetadataRecord.metadataGraph] - - # assert that all dataset guids are present - for ds in datasetList: - assert ds.guid in rocrateGUIDs - - computation_metadata = { - "guid": "test guid", - "name": "Image Compression", - "runBy": "Chris Churas", - "command": "./test.sh", - "dateCreated": "10-28-2024", - "description": "A placeholder computation for image compression", - "keywords": ["cm4ai", "image"], - "usedSoftware": software.guid, - "usedDataset": [ds.guid 
for ds in datasetList], - "generated": None - } - computation = GenerateComputation(**computation_metadata) - AppendCrate(rocratePath, [software, computation]) - - # read in ROCrate - rocrateMetadataRecord = ReadROCrateMetadata(rocratePath) - rocrateGUIDs = [ elem.guid for elem in rocrateMetadataRecord.metadataGraph] - - assert computation.guid in rocrateGUIDs - assert software.guid in rocrateGUIDs - - - + + def setUp(self): + # Create test directory structure + self.rocratePath = pathlib.Path.cwd() / 'tests' / 'data' / 'test_api' + self.rocratePath.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + # Clean up test directory after tests + pass + # if self.rocratePath.exists(): + # shutil.rmtree(self.rocratePath) + + def test_api(self): + # Clean start - safely handle metadata file deletion + metadataFile = self.rocratePath / 'ro-crate-metadata.json' + if metadataFile.exists(): + metadataFile.unlink() + + rocrate_metadata = { + "guid": "ark:59853/UVA/B2AI/rocrate_test", + "name": 'test rocrate', + "organizationName": "UVA", + "projectName": "B2AI", + "description": "Testing ROCrate Model", + "keywords": ["test", "fair"], + "path": self.rocratePath + } + + rocrate = GenerateROCrate(**rocrate_metadata) + + software_metadata = { + "guid": "955cf26c-e3a3-4f0f-b2df-fca4c693cac4:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha", + "author": "Cell Maps team", + "url": "https://github.com/idekerlab/cellmaps_utils", + "name": "cellmaps_utils", + "keywords": [ + "CM4AI", + "0.7alpha", + "MDA-MB-468", + "untreated", + "IF microscopy", + "images", + "breast; mammary gland", + "chromatin", + "tools", + "cellmaps_utils" + ], + "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin Contains utilities needed by Cell Maps tools", + "dateModified": "2024-10-22", + "version": "0.5.0", + "fileFormat": "py", + "usedByComputation": [], + "associatedPublication": None, + "additionalDocumentation": None, + "filepath": "https://github.com/idekerlab/cellmaps_utils", + "cratePath": self.rocratePath + } + software = GenerateSoftware(**software_metadata) + + yellowFolder = self.rocratePath / 'yellow' + yellowFolder.mkdir(exist_ok=True) + + # Create datasets + datasetList = [] + for i in range(10000): + fileName = f'B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg' + datasetMetadata = { + "guid": f"322ab5a2-e6a7-4c46-be79-cbf3e9453cde:cm4ai_chromatin_mda-mb-468_untreated_ifimage_0.7alpha_{i}", # Make unique + "name": f"B2AI_5_untreated_B5_R5_z01_yellow_{i}.jpg yellow channel image", + "keywords": [ + "CM4AI", + "0.7alpha", + "MDA-MB-468", + "untreated", + "IF microscopy", + "images", + "breast; mammary gland", + "chromatin", + "yellow", + "IF", + "image", + "ER (Calreticulin antibody)" + ], + "description": "CM4AI 0.7alpha MDA-MB-468 untreated IF microscopy images breast; mammary gland chromatin IF image file", + "author": "Lundberg Lab", + "datePublished": "2024-10-22", + "version": "0.7alpha", + "dataFormat": "jpg", + "generatedBy": [], + "derivedFrom": [], + "usedBy": [], + "url": None, + "associatedPublication": None, + "additionalDocumentation": None, + "schema": None, + "filepath": f"file:///yellow/{fileName}", + "cratePath": self.rocratePath + } + dataset = GenerateDataset(**datasetMetadata) + datasetList.append(dataset) + + AppendCrate(self.rocratePath, datasetList) + + # Verify crate metadata + rocrateMetadataRecord = ReadROCrateMetadata(self.rocratePath) + rocrateGUIDs = [elem.guid for elem in rocrateMetadataRecord.metadataGraph] + + # Verify all 
dataset GUIDs are present + for ds in datasetList: + self.assertIn(ds.guid, rocrateGUIDs, f"Dataset GUID {ds.guid} not found in metadata") + + computation_metadata = { + "guid": "test-computation-guid", # Made more specific + "name": "Image Compression", + "runBy": "Chris Churas", + "command": "./test.sh", + "dateCreated": "10-28-2024", + "description": "A placeholder computation for image compression", + "keywords": ["cm4ai", "image"], + "usedSoftware": software.guid, + "usedDataset": [ds.guid for ds in datasetList], + "generated": None + } + computation = GenerateComputation(**computation_metadata) + AppendCrate(self.rocratePath, [software, computation]) + + # Final verification + rocrateMetadataRecord = ReadROCrateMetadata(self.rocratePath) + rocrateGUIDs = [elem.guid for elem in rocrateMetadataRecord.metadataGraph] + + self.assertIn(computation.guid, rocrateGUIDs, "Computation GUID not found in metadata") + self.assertIn(software.guid, rocrateGUIDs, "Software GUID not found in metadata") if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() \ No newline at end of file
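
The summary.py helper added under tests/stats-compute-tests can also be driven directly from Python instead of through the CLI computation. A minimal sketch, assuming it is run from inside tests/stats-compute-tests so that summary.py and numbers.csv sit in the working directory; the stats_output directory name is illustrative:

from pathlib import Path

from summary import generate_summary_stats  # tests/stats-compute-tests/summary.py

# Writes summary_stats_<input filename> (here summary_stats_numbers.csv) into stats_output/
generate_summary_stats("numbers.csv", "stats_output")

print(Path("stats_output", "summary_stats_numbers.csv").read_text())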
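The compute-statistics workflow that test_cli_workflow drives through unittest can be reproduced as a standalone script. A sketch under the same assumptions as the test: fairscape_cli is installed, the commands run from tests/stats-compute-tests, and each command prints the ARK identifier of the element it registers.

import datetime
import json
import subprocess


def run(cmd: str) -> str:
    """Run a fairscape_cli command and return its stdout (the registered identifier)."""
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
    return result.stdout.strip()


today = datetime.date.today().isoformat()

run('python -m fairscape_cli rocrate init '
    '--name "Data Analysis Project" '
    '--organization-name "My Organization" '
    '--project-name "Data Analysis" '
    '--description "A project for analyzing data using summary statistics" '
    '--keywords "data-analysis" --keywords "statistics" --keywords "python"')

software_id = run('python -m fairscape_cli rocrate register software ./ '
                  '--name "Summary Statistics Generator" '
                  '--author "Your Name" --version "1.0.0" '
                  '--description "Python script that generates summary statistics for CSV data" '
                  '--keywords "statistics" --file-format "text/x-python" '
                  f'--date-modified "{today}" --filepath "summary.py"')

dataset_id = run('python -m fairscape_cli rocrate register dataset ./ '
                 '--name "Analysis Dataset" --author "Your Name" --version "1.0.0" '
                 f'--date-published "{today}" '
                 '--description "Dataset for statistical analysis" '
                 '--keywords "statistics" --data-format "text/csv" --filepath "numbers.csv"')

computation_id = run('python -m fairscape_cli rocrate compute-statistics ./ '
                     f'--dataset-id "{dataset_id}" --software-id "{software_id}" --command "python"')

# The registered dataset should now point at whatever the computation generated.
with open('ro-crate-metadata.json') as f:
    graph = json.load(f)['@graph']

dataset = next(item for item in graph if item['@id'] == dataset_id)
computation = next(item for item in graph if item['@id'] == computation_id)
assert dataset['hasSummaryStatistics'] == computation['generated']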
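test_dataset_with_summary_stats exercises the other path: attaching an already computed statistics file at registration time via --summary-statistics-filepath. A sketch of that flow, assuming a crate has already been initialized in the current directory (for example by the script above) and that numbers.csv is present; fake_summary.csv mirrors the fixture the test writes.

import datetime
import json
import subprocess


def run(cmd: str) -> str:
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
    return result.stdout.strip()


# Pre-computed statistics file, written the same way the test does.
with open('fake_summary.csv', 'w') as f:
    f.write("statistic,value\nmean,42.0\nmedian,41.5\nstd,5.2")

dataset_id = run('python -m fairscape_cli rocrate register dataset ./ '
                 '--name "Test Dataset" --author "Test Author" --version "1.0.0" '
                 f'--date-published "{datetime.date.today().isoformat()}" '
                 '--description "Dataset with pre-existing summary statistics" '
                 '--keywords "data" --keywords "testing" --data-format "text/csv" '
                 '--filepath "numbers.csv" '
                 '--summary-statistics-filepath "fake_summary.csv"')

with open('ro-crate-metadata.json') as f:
    graph = json.load(f)['@graph']

dataset = next(item for item in graph if item['@id'] == dataset_id)
stats_id = dataset['hasSummaryStatistics']

# The statistics file is registered as its own EVI Dataset, linked to the primary
# dataset through a Computation whose generated list contains it.
computation = next(item for item in graph
                   if item['@type'] == 'https://w3id.org/EVI#Computation'
                   and stats_id in item.get('generated', []))
assert computation['usedDataset'] == [dataset_id]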
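For the programmatic route that test_rocrate_api.py takes, the same models can be used without the CLI. A compact sketch: the field set mirrors the dictionaries used in that test, the guid, path, and metadata values are illustrative, and the diff does not show which of these fields are strictly required.

import pathlib

from fairscape_cli.models.dataset import GenerateDataset
from fairscape_cli.models.rocrate import GenerateROCrate, ReadROCrateMetadata, AppendCrate

cratePath = pathlib.Path.cwd() / 'tests' / 'data' / 'api_sketch'
yellowFolder = cratePath / 'yellow'
yellowFolder.mkdir(parents=True, exist_ok=True)
(yellowFolder / 'example.jpg').touch()

GenerateROCrate(
    guid="ark:59853/UVA/B2AI/rocrate_sketch",
    name="sketch rocrate",
    organizationName="UVA",
    projectName="B2AI",
    description="Minimal ROCrate built through the models API",
    keywords=["example"],
    path=cratePath,
)

dataset = GenerateDataset(
    guid="sketch-dataset-guid",
    name="example yellow channel image",
    keywords=["example", "image"],
    description="Illustrative dataset entity",
    author="Example Lab",
    datePublished="2024-10-22",
    version="0.1.0",
    dataFormat="jpg",
    generatedBy=[],
    derivedFrom=[],
    usedBy=[],
    url=None,
    associatedPublication=None,
    additionalDocumentation=None,
    schema=None,
    filepath="file:///yellow/example.jpg",
    cratePath=cratePath,
)

AppendCrate(cratePath, [dataset])

record = ReadROCrateMetadata(cratePath)
assert dataset.guid in [entity.guid for entity in record.metadataGraph]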