diff --git a/README.md b/README.md
index 1399c709..128460bc 100644
--- a/README.md
+++ b/README.md
@@ -37,11 +37,5 @@ The repository structure is organized as follows:
 - Inclusion of over 25 new slots.
 - 5 new enumerations: EnumClinicalDataSourceType, EnumDataCategory, EnumGuidType, EnumParticipantLifespanStage, EnumResearchDomain.
-### CLI Enhancements:
-- **Validation**: Streamlines data cleaning and validation via the command line (CLI), allowing users to specify the data type and file path. The CLI reads, cleans, and validates data using LinkML-defined models for robust validation. For more details, use:
-
-```bash
-validate-data --help
-```
diff --git a/src/data_validation/README.md b/src/data_validation/README.md
new file mode 100644
index 00000000..617def7c
--- /dev/null
+++ b/src/data_validation/README.md
@@ -0,0 +1,60 @@
+# LinkML Schema Linting and Validation
+
+This project uses [LinkML](https://linkml.io/) to define schemas and validate tabular data files. Below are the commands used to lint schemas and validate data with `linkml-lint` and `linkml-validate`.
+
+---
+
+## 🔍 Schema Linting
+
+We use `linkml-lint` to check the schema for syntax and structure issues.
+
+### 1. Full Linting Check
+
+```bash
+linkml-lint src/linkml/include_schema.yaml
+```
+- **More Info**: [linkml-lint CLI](https://linkml.io/linkml/cli/lint.html)
+
+## ✅ Data Validation
+
+We use `linkml-validate` to check whether data files conform to their schema definitions.
+
+### 1. Validate `study.csv` against the `Study` class
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C Study src/data/input/study.csv
+```
+### 2. Validate `participant.csv` against the `Participant` class
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C Participant src/data/input/participant.csv
+```
+### 3. Validate `condition.csv` against the `Condition` class
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C Condition src/data/input/condition.csv
+```
+### 4. Validate `biospecimen.csv` against the `Biospecimen` class
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C Biospecimen src/data/input/biospecimen.csv
+```
+### 5. Validate `datafile.csv` against the `DataFile` class
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C DataFile src/data/input/datafile.csv
+```
+
+- **More Info**: [linkml-validate CLI](https://linkml.io/linkml/cli/validate.html)
+
+### 📤 Saving Validation Logs
+
+To save validation output to a file (e.g., for documentation or reporting), redirect the output of `linkml-validate`:
+
+```bash
+linkml-validate -s src/linkml/include_schema.yaml -C Study src/data/input/study.csv > src/data/output/validation-report.md
+```
+You can use a different extension (e.g., `.txt`) if you prefer; the redirected output is plain text regardless of the file name.
+
+## 📌 Notes
+
+- Ensure all required fields are present in your CSV files.
+
+- Column names in CSV files must match the schema slot names.
+
+- The schema file (`src/linkml/include_schema.yaml`) must define all referenced classes (`Study`, `Participant`, etc.).
\ No newline at end of file
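The five per-entity commands in the new README lend themselves to scripting. A minimal sketch (not part of this change set; the class names and paths are taken from the README above, everything else is illustrative):

```python
# Sketch: run the five linkml-validate commands from the new README in one go.
import subprocess

SCHEMA = "src/linkml/include_schema.yaml"
ENTITIES = {  # target class -> CSV file stem, as listed in the README
    "Study": "study",
    "Participant": "participant",
    "Condition": "condition",
    "Biospecimen": "biospecimen",
    "DataFile": "datafile",
}

for target_class, stem in ENTITIES.items():
    result = subprocess.run(
        ["linkml-validate", "-s", SCHEMA, "-C", target_class,
         f"src/data/input/{stem}.csv"],
        capture_output=True, text=True,
    )
    print(f"{stem}.csv -> exit code {result.returncode}")
```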
diff --git a/src/data_validation/__init__.py b/src/data_validation/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/data_validation/cli.py b/src/data_validation/cli.py
deleted file mode 100644
index 31300b39..00000000
--- a/src/data_validation/cli.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import argparse
-from .validation import (
-    validate_study,
-    validate_participant,
-    validate_condition,
-    validate_biospecimen,
-    validate_datafile,
-    validate_dataset,
-    validate_datasetmanifest
-)
-
-# Dictionary to map entity names to validation functions
-entity_validators = {
-    'study': validate_study,
-    'participant': validate_participant,
-    'condition': validate_condition,
-    'biospecimen': validate_biospecimen,
-    'datafile': validate_datafile,
-    'dataset': validate_dataset,
-    'datasetmanifest': validate_datasetmanifest
-}
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Validate data from a CSV file using Pydantic models')
-    parser.add_argument('input_file', help='Path to the input CSV file')
-    parser.add_argument('-o', '--output', help='Path to the directory to save error logs')
-    parser.add_argument('entity', choices=entity_validators.keys(), help='Entity to validate')
-    args = parser.parse_args()
-
-    # Print a friendly prompt to indicate processing
-    print(f"Validating {args.entity} data from file: {args.input_file}")
-
-    # Retrieve the appropriate validation function based on the specified entity
-    validation_function = entity_validators[args.entity]
-    validation_function(args.input_file, args.output)
-
-    # Print a friendly message indicating completion
-    print("Validation complete!")
-
-
-if __name__ == "__main__":
-    main()
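For context on the removal above: the deleted `cli.py` was the entry point behind the `validate-data` command referenced in the old top-level README. Reconstructed from the argparse definition, a typical pre-deletion invocation and its programmatic equivalent looked roughly like this (illustrative only; the exact argument order is inferred):

```python
# Pre-deletion usage, reconstructed from the argparse definition above:
#
#   validate-data src/data/input/study.csv study -o logs
#
# The CLI dispatched to the per-entity validators in validation.py, which
# could also be called directly:
from src.data_validation.validation import validate_study  # removed by this change

valid_count, invalid_count = validate_study(
    "src/data/input/study.csv",  # input CSV
    "logs",                      # directory that receives the error log
)
print(f"{valid_count} valid rows, {invalid_count} invalid rows")
```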
diff --git a/src/data_validation/validate_biospecimen.py b/src/data_validation/validate_biospecimen.py
deleted file mode 100644
index df8702b5..00000000
--- a/src/data_validation/validate_biospecimen.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_biospecimen_entry(row):
-    try:
-        instance = Biospecimen(
-            studyCode=handle_nan(row['study code']),
-            participantGlobalId=handle_nan(row['participant global id']),
-            participantExternalId=handle_nan(row['participant external id']),
-            sampleGlobalId=handle_nan(row['sample global id']),
-            sampleExternalId=handle_nan(row['sample external id']),
-            sampleType=handle_nan(row['sample type']),
-            ageAtBiospecimenCollection=handle_nan(row['age at biospecimen collection']),
-            parentSampleGlobalId=handle_nan(row['parent sample global id']),
-            parentSampleExternalId=handle_nan(row['parent sample external id']),
-            parentSampleType=handle_nan(row['parent sample type']),
-            collectionGlobalId=handle_nan(row['collection global id']),
-            collectionExternalId=handle_nan(row['collection external id']),
-            collectionSampleType=handle_nan(row['collection sample type']),
-            containerGlobalId=handle_nan(row['container global id']),
-            containerExternalId=handle_nan(row['container external id']),
-            volume=handle_nan(row['volume']),
-            volumeUnit=handle_nan(row['volume unit']),
-            concentration=handle_nan(row['concentration']),
-            concentrationUnit=handle_nan(row['concentration unit']),
-            laboratoryProcedure=handle_nan(row['laboratory procedure']),
-            biospecimenStorage=handle_nan(row['biospecimen storage']),
-            sampleAvailability=row['sample availability'],
-            containerAvailability=row['container availability']
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['study code']) + "-" + str(row['sample external id']), e)
-        return False, error_details
diff --git a/src/data_validation/validate_condition.py b/src/data_validation/validate_condition.py
deleted file mode 100644
index 28d8e351..00000000
--- a/src/data_validation/validate_condition.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant, Condition
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_condition_entry(row):
-    try:
-        instance = Condition(
-            studyCode=row['study code'],
-            participantGlobalId=handle_nan(row['participant global id']),
-            participantExternalId=handle_nan(row['participant external id']),
-            eventId=handle_nan(row['event id']),
-            eventType=handle_nan(row['event type']),
-            conditionMeasureSourceText=handle_nan(row['condition or measure source text']),
-            ageAtConditionMeasureObservation=handle_nan(row['age at condition or measure observation']),
-            conditionInterpretation=row['condition interpretation'],
-            conditionStatus=row['condition status'],
-            conditionDataSource=row['condition data source'],
-            hpoLabel=handle_nan(row['hpo label']),
-            hpoCode=handle_nan(row['hpo code']),
-            mondoLabel=handle_nan(row['mondo label']),
-            mondoCode=handle_nan(row['mondo code']),
-            maxoLabel=handle_nan(row['maxo label']),
-            maxoCode=handle_nan(row['maxo code']),
-            otherLabel=handle_nan(row['other label']),
-            otherCode=handle_nan(row['other code']),
-            measureValue=handle_nan(row['measure value']),
-            measureUnit=handle_nan(row['measure unit'])
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['study code']) + "-" + str(row['participant external id']) + "-" + str(row['event id']), e)
-        return False, error_details
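The two modules above and the remaining `validate_*` modules below all implement the same row-validator contract. A condensed, self-contained sketch of that pattern (`Example` is a stand-in for the generated Pydantic models):

```python
from pydantic import BaseModel, ValidationError

class Example(BaseModel):  # stand-in for Biospecimen, Condition, etc.
    studyCode: str

def validate_example_entry(row):
    # Build the model from one DataFrame row; return (True, None) on success,
    # or (False, (row_key, error)) on failure, as the deleted validators do.
    try:
        Example(studyCode=row["study code"])
        return True, None
    except ValidationError as e:
        return False, (str(row["study code"]), e)

ok, err = validate_example_entry({"study code": "STUDY1"})
assert ok and err is None
```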
diff --git a/src/data_validation/validate_datafile.py b/src/data_validation/validate_datafile.py
deleted file mode 100644
index 2be199f8..00000000
--- a/src/data_validation/validate_datafile.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_datafile_entry(row):
-    try:
-        instance = DataFile(
-            studyCode=row['study code'],
-            participantGlobalId=handle_nan(row['participant global id']),
-            participantExternalId=handle_nan(row['participant external id']),
-            sampleGlobalId=handle_nan(row['sample global id']),
-            sampleExternalId=handle_nan(row['sample external id']),
-            fileName=handle_nan(row['file name']),
-            fileGlobalId=handle_nan(row['file global id']),
-            fileS3Location=handle_nan(row['file s3 location']),
-            fileUploadLocation=handle_nan(row['file upload location']),
-            drsUri=handle_nan(row['drs uri']),
-            fileHash=handle_nan(row['file hash']),
-            dataAccess=row['data access'],
-            dataCategory=row['data category'],
-            dataType=handle_nan(row['data type']),
-            experimentalStrategy=row['experimental strategy'].split('|') if handle_nan(row['experimental strategy']) else [],
-            experimentalPlatform=row['experimental platform'].split('|') if handle_nan(row['experimental platform']) else [],
-            fileFormat=handle_nan(row['file format']),
-            fileSize=handle_nan(row['file size']),
-            fileSizeUnit=handle_nan(row['file size unit'])
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['sample external id']) + "-" + str(row['file global id']), e)
-        return False, error_details
diff --git a/src/data_validation/validate_dataset.py b/src/data_validation/validate_dataset.py
deleted file mode 100644
index e5474586..00000000
--- a/src/data_validation/validate_dataset.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_dataset_entry(row):
-    try:
-        instance = Dataset(
-            studyCode=row['study code'],
-            datasetName=handle_nan(row['dataset name']),
-            datasetDescription=handle_nan(row['dataset description']),
-            datasetGlobalId=handle_nan(row['dataset global id']),
-            datasetExternalId=handle_nan(row['dataset external id']),
-            expectedNumberOfParticipants=handle_nan(row['expected number of participants']),
-            expectedNumberOfFiles=handle_nan(row['expected number of files']),
-            dataCollectionStartYear=handle_nan(row['data collection start year']),
-            dataCollectionEndYear=handle_nan(row['data collection end year']),
-            dataCategory=row['data category'].split('|') if handle_nan(row['data category']) else [],
-            dataType=row['data type'].split('|') if handle_nan(row['data type']) else [],
-            experimentalStrategy=row['experimental strategy'].split('|') if handle_nan(row['experimental strategy']) else [],
-            experimentalPlatform=row['experimental platform'].split('|') if handle_nan(row['experimental platform']) else [],
-            publication=row['publication'].split('|') if handle_nan(row['publication']) else [],
-            accessLimitations=handle_nan(row['access limitations']),
-            accessRequirements=handle_nan(row['access requirements']),
-            dbgap=row['dbgap'].split('|') if handle_nan(row['dbgap']) else [],
-            otherRepository=handle_nan(row['other repository']),
-            otherAccessAuthority=handle_nan(row['other access authority']),
-            isHarmonized=bool(row['is harmonized?'])
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (row['dataset name'] + "-" + str(row['dataset external id']), e)
-        return False, error_details
diff --git a/src/data_validation/validate_datasetmanifest.py b/src/data_validation/validate_datasetmanifest.py
deleted file mode 100644
index ed50d08f..00000000
--- a/src/data_validation/validate_datasetmanifest.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant, Condition, Biospecimen, DataFile, Dataset, DatasetManifest
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_datasetmanifest_entry(row):
-    try:
-        instance = DatasetManifest(
-            studyCode=row['study code'],
-            datasetName=handle_nan(row['dataset name']),
-            datasetGlobalId=handle_nan(row['dataset global id']),
-            datasetExternalId=handle_nan(row['dataset external id']),
-            fileName=handle_nan(row['file name']),
-            fileGlobalId=handle_nan(row['file global id'])
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['dataset name']) + "-" + str(row['dataset external id']), e)
-        return False, error_details
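A recurring idiom in the modules above (and in `validate_study.py` below): multivalued slots such as `experimental strategy` arrive pipe-delimited in a single CSV cell and are split into lists, with NaN or empty cells mapped to `[]`. The idiom isolated, with illustrative values:

```python
import pandas as pd

def split_multivalued(value):
    # Equivalent of the repeated
    #   row[col].split('|') if handle_nan(row[col]) else []
    # expressions: pipe-delimited cells become lists, NaN/empty cells become [].
    if pd.isna(value) or value == "":
        return []
    return str(value).split("|")

assert split_multivalued("WGS|RNA-Seq") == ["WGS", "RNA-Seq"]
assert split_multivalued(float("nan")) == []
```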
@@ -1,32 +0,0 @@
-from src.include_linkml.include_pydantic import Study, Participant
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-def validate_participant_entry(row):
-    try:
-        instance = Participant(
-            studyCode=row['study code'],
-            participantGlobalId=handle_nan(row['participant global id']),
-            participantExternalId=handle_nan(row['participant external id']),
-            familyId=handle_nan(row['family id']),
-            familyType=row['family type'],
-            fatherId=handle_nan(row['father id']),
-            motherId=handle_nan(row['mother id']),
-            siblingId=handle_nan(row['sibling id']),
-            otherFamilyMemberId=handle_nan(row['other family member id']),
-            familyRelationship=row['family relationship'],
-            sex=row['sex'],
-            race=row['race'],
-            ethnicity=row['ethnicity'],
-            downSyndromeStatus=row['down syndrome status'],
-            ageAtFirstPatientEngagement=handle_nan(row['age at first patient engagement']),
-            firstPatientEngagementEvent=handle_nan(row['first patient engagement event']),
-            outcomesVitalStatus=row['outcomes vital status'],
-            ageAtLastVitalStatus=handle_nan(row['age at last vital status'])
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['study code']) + "-" + str(row['participant external id']), e)
-        return False, error_details
diff --git a/src/data_validation/validate_study.py b/src/data_validation/validate_study.py
deleted file mode 100644
index 25bfba98..00000000
--- a/src/data_validation/validate_study.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from src.include_linkml.include_pydantic import Study
-from src.data_validation.validation_utils import handle_nan
-from pydantic import ValidationError
-
-
-def validate_study_entry(row):
-    try:
-        instance = Study(
-            studyCode = row['study code'],
-            studyTitle = handle_nan(row['study title']),
-            program = row['program'].split('|') if handle_nan(row['program']) else [],
-            studyDescription = handle_nan(row['study description']),
-            principalInvestigatorName = handle_nan(row['principal investigator name']).split('|') if handle_nan(row['principal investigator name']) else [],
-            studyContactName = handle_nan(row['study contact name']).split('|') if handle_nan(row['study contact name']) else [],
-            studyContactInstitution = handle_nan(row['study contact institution']).split('|') if handle_nan(row['study contact institution']) else [],
-            studyContactEmail = handle_nan(row['study contact email']).split('|') if handle_nan(row['study contact email']) else [],
-            vbrEmail = handle_nan(row['vbr email']),
-            vbrUrl = handle_nan(row['vbr url']),
-            vbrReadme = handle_nan(row['vbr readme']),
-            researchDomain = row['research domain'].split('|') if handle_nan(row['research domain']) else [],
-            participantLifespanStage = row['participant lifespan stage'].split('|') if handle_nan(row['participant lifespan stage']) else [],
-            selectionCriteria = handle_nan(row['selection criteria']),
-            studyDesign = row['study design'].split('|') if handle_nan(row['study design']) else [],
-            clinicalDataSourceType = row['clinical data source type'].split('|') if handle_nan(row['clinical data source type']) else [],
-            dataCategory = row['data category'].split('|') if handle_nan(row['data category']) else [],
-            studyWebsite = handle_nan(row['study website']),
-            dbgap = row['dbgap'].split('|') if handle_nan(row['dbgap']) else [],
-            publication = str(row['publication']).split('|') if handle_nan(row['publication']) else [],
-            expectedNumberOfParticipants = handle_nan(row['expected number of participants']),
-            guidType = row['guid type'],
-            guidMapped = bool(row['guid mapped']),
-            acknowledgments = row['acknowledgments'].split('|') if handle_nan(row['acknowledgments']) else [],
-            citationStatement = row['citation statement'].split('|') if handle_nan(row['citation statement']) else []
-        )
-        # Validation successful
-        return True, None
-    except ValidationError as e:
-        # Validation failed
-        error_details = (str(row['study code']), e)
-        return False, error_details
\ No newline at end of file
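One behavior worth flagging in the deleted `validate_study_entry` (and `validate_dataset_entry` above): boolean cells were coerced with plain `bool(...)`, and `bool` of any non-empty string is `True`, so a CSV cell containing the text `false` would have been read as `True`. A stricter parser is sketched below (hypothetical; not what the deleted code did):

```python
# bool() on strings considers only emptiness, not the text:
assert bool("false") is True
assert bool("") is False

def parse_csv_bool(value):
    # Hypothetical stricter coercion for cells like "true"/"False"/"1".
    return str(value).strip().lower() in {"true", "yes", "1"}

assert parse_csv_bool("false") is False
assert parse_csv_bool("True") is True
```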
diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py
deleted file mode 100644
index 27a3dfb9..00000000
--- a/src/data_validation/validation.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from src.data_validation.validation_utils import validate_data
-from src.data_validation.validate_study import validate_study_entry
-from src.data_validation.validate_participant import validate_participant_entry
-from src.data_validation.validate_condition import validate_condition_entry
-from src.data_validation.validate_biospecimen import validate_biospecimen_entry
-from src.data_validation.validate_datafile import validate_datafile_entry
-from src.data_validation.validate_dataset import validate_dataset_entry
-from src.data_validation.validate_datasetmanifest import validate_datasetmanifest_entry
-
-
-def validate_study(file_path, output_path='.'):
-    string_columns = ['study code', 'program', 'research domain', 'participant lifespan stage',
-                      'study design', 'clinical data source type', 'data category', 'guid type']
-    return validate_data(file_path, string_columns, validate_study_entry, output_path)
-
-
-def validate_participant(file_path, output_path='.'):
-    string_columns = ['study code', 'family type', 'family relationship', 'sex', 'race', 'ethnicity',
-                      'down syndrome status', 'outcomes vital status']
-    return validate_data(file_path, string_columns, validate_participant_entry, output_path)
-
-
-def validate_condition(file_path, output_path='.'):
-    string_columns = ['study code', 'condition interpretation', 'condition status', 'condition data source']
-    return validate_data(file_path, string_columns, validate_condition_entry, output_path)
-
-
-def validate_biospecimen(file_path, output_path='.'):
-    string_columns = ['study code', 'sample availability', 'container availability']
-    return validate_data(file_path, string_columns, validate_biospecimen_entry, output_path)
-
-
-def validate_datafile(file_path, output_path='.'):
-    string_columns = ['study code', 'data access', 'data category']
-    return validate_data(file_path, string_columns, validate_datafile_entry, output_path)
-
-
-def validate_dataset(file_path, output_path='.'):
-    string_columns = ['study code', 'data category']
-    return validate_data(file_path, string_columns, validate_dataset_entry, output_path)
-
-
-def validate_datasetmanifest(file_path, output_path='.'):
-    string_columns = ['study code']
-    return validate_data(file_path, string_columns, validate_datasetmanifest_entry, output_path)
diff --git a/src/data_validation/validation_utils.py b/src/data_validation/validation_utils.py
deleted file mode 100644
index 4dbdce62..00000000
--- a/src/data_validation/validation_utils.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import pandas as pd
-import os
-from datetime import datetime
-import chardet
-
-
-def clean_string(value):
-    if isinstance(value, str):
-        return value.lower().replace(' ', '_').replace('-', '_').replace('/', '_')
-    elif pd.isna(value):
-        return None
-    else:
-        return str(value).lower().replace(' ', '_').replace('-', '_').replace('/', '_')
-
-
-def clean_dataframe_strings(df, string_columns):
-    df[string_columns] = df[string_columns].map(clean_string)
-
-
-def validate_dataframe(df, entry_validator, input_file_name=None, output_path=None):
-    # Check if required columns exist before validation
-    try:
-        validation_results = df.apply(entry_validator, axis=1)
-    except KeyError as e:
-        print(f"Error: Missing column - {e}")
-        return 0, df.shape[0]  # All records are invalid
-
-    valid_count = validation_results[validation_results.apply(lambda x: x[0])].shape[0]
-    invalid_count = validation_results.shape[0] - valid_count
-    print("Number of errors by record type:")
-    for is_valid, error_info in validation_results:
-        if not is_valid:
-            print(f"{error_info[0]}: {str(error_info[1]).split()[0]}")
-    total_records = df.shape[0]
-    print(f"Total number of records in the file: {total_records}")
-    print(f"Number of records with error: {invalid_count}")
-    if output_path:
-        output_file_path = save_validation_results(validation_results, input_file_name, output_path)
-        print(f"Validation results saved to: {output_file_path}")
-    return valid_count, invalid_count
-
-
-def save_validation_results(validation_results, input_file_name, output_path):
-    os.makedirs(output_path, exist_ok=True)
-    current_date = datetime.now().strftime("%Y-%m-%d")
-    output_file_name = f'{input_file_name}_validation_results_{current_date}.txt' if input_file_name else f'validation_results_{current_date}.txt'
-    output_file_path = os.path.join(output_path, output_file_name)
-    validation_results_str = [str(item) for item in validation_results]
-    with open(output_file_path, 'w') as file:
-        file.write('\n'.join(validation_results_str))
-    return output_file_path
-
-def detect_encoding(file_path):
-    with open(file_path, 'rb') as f:
-        result = chardet.detect(f.read())
-    return result['encoding']
-
-def read_csv_file(file_path):
-    encoding = detect_encoding(file_path)
-    df = pd.read_csv(file_path, encoding=encoding)
-    df.columns = df.columns.str.lower()  # Convert column names to lower case
-    return df
-
-
-def validate_data(file_path, string_columns, validation_function, output_path='.'):
-    file_name = os.path.basename(file_path)
-    df = read_csv_file(file_path)
-    clean_dataframe_strings(df, string_columns)
-    valid_count, invalid_count = validate_dataframe(df, validation_function, input_file_name=file_name,
-                                                    output_path=output_path)
-    return valid_count, invalid_count
-
-def handle_nan(value):
-    """Convert NaN values to None"""
-    if pd.isna(value):
-        return None
-    return value
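Finally, the normalization that the deleted `clean_string` applied to enum-like columns before validation, demonstrated with the function copied verbatim from `validation_utils.py` above (presumably so CSV values line up with underscore-style enumeration values):

```python
import pandas as pd

def clean_string(value):  # copied verbatim from the deleted validation_utils.py
    if isinstance(value, str):
        return value.lower().replace(' ', '_').replace('-', '_').replace('/', '_')
    elif pd.isna(value):
        return None
    else:
        return str(value).lower().replace(' ', '_').replace('-', '_').replace('/', '_')

assert clean_string("Down Syndrome") == "down_syndrome"
assert clean_string("RNA-Seq/Bulk") == "rna_seq_bulk"  # '-' and '/' also map to '_'
assert clean_string(float("nan")) is None
```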