Skip to content
10 changes: 8 additions & 2 deletions src/fairscape_cli/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from fairscape_cli.models.dataset import (
Dataset,
GenerateDataset
GenerateDataset,
generateSummaryStatsElements,
registerOutputs
)
from fairscape_cli.models.software import Software, GenerateSoftware
from fairscape_cli.models.computation import Computation, GenerateComputation
Expand All @@ -9,13 +11,16 @@
GenerateROCrate,
ReadROCrateMetadata,
AppendCrate,
CopyToROCrate
CopyToROCrate,
UpdateCrate
)
from fairscape_cli.models.bagit import BagIt

__all__ = [
'Dataset',
'GenerateDataset',
'generateSummaryStatsElements',
'registerOutputs',
'Software',
'GenerateSoftware',
'Computation',
Expand All @@ -25,5 +30,6 @@
'ReadROCrateMetadata',
'AppendCrate',
'CopyToROCrate',
'UpdateCrate',
'BagIt'
]
23 changes: 8 additions & 15 deletions src/fairscape_cli/models/computation.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
from fairscape_cli.models.base import FairscapeBaseModel
from fairscape_cli.models.utils import GenerateDatetimeSquid
from fairscape_cli.config import NAAN

from typing import (
Optional,
List,
Union,
Dict,
)
from pydantic import (
Field,
AnyUrl
)
import re
from datetime import datetime
from typing import Optional, List, Union, Dict

from pydantic import Field, AnyUrl

from fairscape_cli.config import NAAN
from fairscape_cli.models.base import FairscapeBaseModel
from fairscape_cli.models.guid_utils import GenerateDatetimeSquid


class Computation(FairscapeBaseModel):
Expand Down Expand Up @@ -67,7 +60,7 @@ def GenerateComputation(
computation_model = Computation.model_validate(
{
"@id": guid,
"@type": "https://w2id.org/EVI#Computation",
"@type": "https://w3id.org/EVI#Computation",
"name": name,
"description": description,
"keywords": keywords,
Expand Down
168 changes: 132 additions & 36 deletions src/fairscape_cli/models/dataset.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,7 @@
from fairscape_cli.models.base import (
FairscapeBaseModel,
Identifier
)
from fairscape_cli.config import (
NAAN
)
from fairscape_cli.models.utils import GenerateDatetimeSquid, FileNotInCrateException
from fairscape_cli.models.schema.tabular import (
TabularValidationSchema
)

# Standard library imports
import pathlib
from typing import (
Optional,
List,
Union,
Dict
)
from datetime import datetime
from typing import Optional, List, Union, Dict, Tuple, Set

from pydantic import (
BaseModel,
Expand All @@ -25,7 +10,10 @@
AnyUrl,
field_serializer
)
from datetime import datetime

from fairscape_cli.models.base import FairscapeBaseModel
from fairscape_cli.models.guid_utils import GenerateDatetimeSquid
from fairscape_cli.config import NAAN


class Dataset(FairscapeBaseModel):
Expand All @@ -44,6 +32,7 @@ class Dataset(FairscapeBaseModel):
derivedFrom: Optional[List[str]] = Field(default=[])
usedBy: Optional[List[str]] = Field(default=[])
contentUrl: Optional[str] = Field(default=None)
hasSummaryStatistics: Optional[Union[str, List[str]]] = Field(default=None)

#@field_serializer('datePublished')
#def serialize_date_published(self, datePublished: datetime):
Expand All @@ -68,11 +57,13 @@ def GenerateDataset(
usedBy: Optional[List[str]],
generatedBy: Optional[List[str]],
filepath: Optional[str],
cratePath
cratePath,
summary_stats_guid: Optional[str] = None
):

sq = GenerateDatetimeSquid()
guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}"
if not guid:
sq = GenerateDatetimeSquid()
guid = f"ark:{NAAN}/dataset-{name.lower().replace(' ', '-')}-{sq}"

datasetMetadata = {
"@id": guid,
Expand All @@ -88,22 +79,14 @@ def GenerateDataset(
"additionalDocumentation": additionalDocumentation,
"format": dataFormat,
"schema": schema,
# sanitize input lists of newline breaks
"derivedFrom": [
derived.strip("\n") for derived in derivedFrom
],
"usedBy": [
used.strip("\n") for used in usedBy
],
"generatedBy": [
gen.strip("\n") for gen in generatedBy
]
"derivedFrom": [derived.strip("\n") for derived in derivedFrom],
"usedBy": [used.strip("\n") for used in usedBy],
"generatedBy": [gen.strip("\n") for gen in generatedBy],
"hasSummaryStatistics": summary_stats_guid
}

datasetMetadata['contentURL'] = setRelativeFilepath(cratePath, filepath)

datasetMetadata['contentUrl'] = setRelativeFilepath(cratePath, filepath)
datasetInstance = Dataset.model_validate(datasetMetadata)

return datasetInstance


Expand Down Expand Up @@ -136,4 +119,117 @@ def setRelativeFilepath(cratePath, filePath):
# if relative filepath
datasetPath = pathlib.Path(filePath).absolute()
relativePath = datasetPath.relative_to(rocratePath)
return f"file:///{str(relativePath)}"
return f"file:///{str(relativePath)}"


from fairscape_cli.models.computation import GenerateComputation, Computation
def generateSummaryStatsElements(
    name: str,
    author: str,
    keywords: List[str],
    date_published: str,
    version: str,
    associated_publication: Optional[str],
    additional_documentation: Optional[str],
    schema: Optional[str],
    dataset_guid: str,
    summary_statistics_filepath: str,
    crate_path: pathlib.Path
) -> Tuple[str, Dataset, Computation]:
    """Build the summary-statistics dataset and the computation that produced it.

    Two identifiers are minted from ``name`` (one for the statistics dataset,
    one for the computation) and the two elements are cross-linked: the
    computation lists ``dataset_guid`` as used and the statistics GUID as
    generated, while the statistics dataset lists the computation in
    ``generatedBy``.

    Args:
        name: Name of the main dataset; also used to derive both GUIDs.
        author: Author of the dataset; recorded as ``runBy`` on the computation.
        keywords: Keywords applied to both generated elements.
        date_published: Used as the dataset's ``datePublished`` and the
            computation's ``dateCreated``.
        version: Version recorded on the statistics dataset.
        associated_publication: Optional associated publication.
        additional_documentation: Optional additional documentation.
        schema: Optional schema.
        dataset_guid: GUID of the main dataset the statistics describe.
        summary_statistics_filepath: Path to the summary statistics file.
        crate_path: Path to the RO-Crate.

    Returns:
        A ``(summary_stats_guid, summary_stats_dataset, computation)`` tuple.
    """
    name_slug = name.lower().replace(' ', '-')

    # Mint the identifiers: the statistics dataset first, then the computation.
    stats_guid = f"ark:{NAAN}/dataset-{name_slug}-stats-{GenerateDatetimeSquid()}"
    comp_guid = f"ark:{NAAN}/computation-{name_slug}-stats-{GenerateDatetimeSquid()}"

    # Computation that consumed the main dataset and produced the statistics.
    stats_computation = GenerateComputation(
        guid=comp_guid,
        name=f"Summary Statistics Computation for {name}",
        description=f"Computation that generated summary statistics for dataset: {name}",
        keywords=keywords,
        runBy=author,
        command="",
        dateCreated=date_published,
        usedSoftware=[],
        usedDataset=[dataset_guid],
        generated=[stats_guid]
    )

    # Dataset describing the statistics file itself; the format is fixed to 'pdf'.
    stats_dataset = GenerateDataset(
        guid=stats_guid,
        url=None,
        name=f"{name} - Summary Statistics",
        author=author,
        description=f"Summary statistics for dataset: {name}",
        keywords=keywords,
        datePublished=date_published,
        version=version,
        associatedPublication=associated_publication,
        additionalDocumentation=additional_documentation,
        dataFormat='pdf',
        schema=schema,
        derivedFrom=[],
        usedBy=[],
        generatedBy=[comp_guid],
        filepath=summary_statistics_filepath,
        cratePath=crate_path,
        summary_stats_guid=None
    )

    return stats_guid, stats_dataset, stats_computation

def registerOutputs(
    new_files: Set[pathlib.Path],
    computation_id: str,
    dataset_id: str,
    author: str
) -> List["Dataset"]:
    """Register every output file as a dataset generated by a computation.

    Args:
        new_files: Paths of the output files to register.
        computation_id: GUID recorded in each output's ``generatedBy`` list.
        dataset_id: GUID of the analysed dataset; only used in the
            description text of each output.
        author: Author recorded on every output dataset.

    Returns:
        One ``GenerateDataset`` result per file, ordered by path. An empty
        input yields an empty list.
    """
    output_instances = []
    # A set iterates in hash order, which differs between interpreter runs;
    # sorting keeps the registered outputs in a reproducible order.
    for file_path in sorted(new_files):
        output_instance = GenerateDataset(
            guid=None,
            name=f"Statistics Output - {file_path.name}",
            author=author,  # Use the original author
            description=f"Statistical analysis output for {dataset_id}",
            keywords=["statistics"],
            datePublished=datetime.now().isoformat(),
            version="1.0",
            # suffix includes the leading dot; drop it to get the bare format
            dataFormat=file_path.suffix[1:],
            filepath=str(file_path),
            # the file's own directory is passed as the crate path
            cratePath=str(file_path.parent),
            url=None,
            associatedPublication=None,
            additionalDocumentation=None,
            schema=None,
            derivedFrom=[],
            usedBy=[],
            generatedBy=[computation_id]
        )
        output_instances.append(output_instance)
    return output_instances
31 changes: 31 additions & 0 deletions src/fairscape_cli/models/guid_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sqids import Sqids
import random
import datetime

from typing import Set, Dict, List, Optional, Tuple

from fairscape_cli.config import NAAN

# Module-level Sqids encoder shared by the GUID helpers in this module
# (constructed with min_length=6).
squids = Sqids(min_length=6)

def GenerateDatetimeSquid():
    """Return a squid encoding the current UTC epoch second plus a random salt.

    The timestamp is taken from a timezone-aware ``datetime`` built with
    ``datetime.timezone.utc``, which is available on every Python 3 version,
    so no version-dependent fallback is needed. A random integer in
    ``[0, 10000]`` is always encoded alongside the timestamp so that two
    squids generated within the same second are unlikely to collide.

    Returns:
        The string produced by the module-level ``squids`` encoder.
    """
    # Aware datetime: .timestamp() on a naive utcnow() value would be
    # interpreted as local time and yield a shifted epoch.
    timestamp_int = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    return squids.encode([timestamp_int, random.randint(0, 10000)])

def GenerateDatetimeGUID(prefix: str) -> str:
    """Return an ARK identifier of the form ``ark:{NAAN}/{prefix}-{squid}``.

    The squid encodes the current UTC epoch second. The timestamp is taken
    from a timezone-aware ``datetime`` built with ``datetime.timezone.utc``,
    which works on every Python 3 version, so no fallback branch is needed.

    Args:
        prefix: Text placed between the NAAN and the encoded timestamp.

    Returns:
        The generated identifier string.
    """
    # Aware datetime: .timestamp() on a naive utcnow() value would be
    # interpreted as local time and yield a shifted epoch.
    timestamp_int = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    sq = squids.encode([timestamp_int])
    return f"ark:{NAAN}/{prefix}-{sq}"

def GenerateGUID(data: List[int], prefix: str) -> str:
    """Encode ``data`` with the shared squid encoder and wrap it as an ARK.

    Args:
        data: Integers handed to ``squids.encode``.
        prefix: Text placed between the NAAN and the encoded value.

    Returns:
        A string of the form ``ark:{NAAN}/{prefix}-{encoded}``.
    """
    return f"ark:{NAAN}/{prefix}-{squids.encode(data)}"
61 changes: 38 additions & 23 deletions src/fairscape_cli/models/rocrate.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,16 @@
from fairscape_cli.models import (
Software,
Dataset,
Computation
)
from fairscape_cli.models.utils import GenerateDatetimeSquid
from fairscape_cli.config import (
DEFAULT_CONTEXT,
NAAN
)

import pathlib
import shutil
import json
from typing import Optional, Union, List, Literal, Dict

from prettytable import PrettyTable
from pydantic import (
BaseModel,
computed_field,
Field,
)
from typing import (
Optional,
Union,
List,
Literal,
Dict
)
from pydantic import BaseModel, computed_field, Field

from fairscape_cli.config import NAAN, DEFAULT_CONTEXT
from fairscape_cli.models.software import Software
from fairscape_cli.models.dataset import Dataset
from fairscape_cli.models.computation import Computation
from fairscape_cli.models.guid_utils import GenerateDatetimeSquid

class ROCrateMetadata(BaseModel):
guid: Optional[str] = Field(alias="@id", default=None)
Expand Down Expand Up @@ -321,3 +307,32 @@ def CopyToROCrate(source_filepath: str, destination_filepath: str):
# copy the file into the destinationPath
shutil.copy(source_path, destination_path)

def UpdateCrate(
    cratePath: pathlib.Path,
    element: "Union[Dataset, Software, Computation]"
):
    """Update an existing element in the RO-Crate metadata by matching @id.

    The first ``@graph`` entry whose ``@id`` equals ``element.guid`` is
    replaced with ``element.model_dump(by_alias=True, exclude_none=True)``.
    The metadata file is rewritten (indent of 2) whether or not a match was
    found.

    Args:
        cratePath: Path to the RO-Crate directory or directly to the metadata
            file. A plain string is accepted as well.
        element: Updated element to replace the existing one with matching @id.

    Raises:
        TypeError: If the updated metadata cannot be serialized to JSON. The
            file on disk is left untouched in that case.
    """
    metadata_path = pathlib.Path(cratePath)
    if metadata_path.is_dir():
        metadata_path = metadata_path / 'ro-crate-metadata.json'

    with metadata_path.open("r+") as rocrate_metadata_file:
        rocrate_metadata = json.load(rocrate_metadata_file)

        # Find and replace the element with matching @id
        for i, existing in enumerate(rocrate_metadata['@graph']):
            if existing.get('@id') == element.guid:
                rocrate_metadata['@graph'][i] = element.model_dump(
                    by_alias=True,
                    exclude_none=True
                )
                break

        # Serialize BEFORE truncating: if serialization fails, the existing
        # metadata file must not be left empty or half-written.
        serialized = json.dumps(rocrate_metadata, indent=2)

        # Write back the updated metadata
        rocrate_metadata_file.seek(0)
        rocrate_metadata_file.truncate()
        rocrate_metadata_file.write(serialized)
Loading
Loading