diff --git a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Classes_Slots.tsv b/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Classes_Slots.tsv deleted file mode 100644 index 2a307f3b..00000000 --- a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Classes_Slots.tsv +++ /dev/null @@ -1,74 +0,0 @@ -class: string slots slot: string required: 0..1 x boolean slot_required: 0..1 x boolean title: 0..1 x string is_a: 0..1 x definition definition_uri: 0..1 x uriorcurie description: 0..1 (recommended) x string domain_of: 0..* x class_definition from_schema: 0..1 x uri notes: 0..* x string range: 0..1 x element requires_component: string -> class slots slot annotations required title is_a definition_uri description domain_of from_schema notes range annotations -> "internal_separator: ""|""" inner_key: required "internal_separator: ""|""" "internal_separator: ""|""" inner_key: requires_component - access_url False include:access_url Storage location for this file DataFile https://w3id.org/include - age_at_biospecimen_collection False include:age_at_biospecimen_collection Age in days of participant at time of biospecimen collection Biospecimen https://w3id.org/include - age_at_condition_observation FALSE include:age_at_condition_observation Age in days at which condition was observed, recorded, or diagnosed Condition https://w3id.org/include - age_at_last_vital_status False include:age_at_last_vital_status Age in days when participant's vital status was last recorded Participant https://w3id.org/include - biospecimen_storage False include:biospecimen_storage Method by which Container is stored (e.g. -80C freezer, Liquid nitrogen, etc.) Biospecimen https://w3id.org/include - collection_id False include:collection_id Identifier for the eldest sample in a lineage of processed, pooled, or aliquoted samples. This may be the same as Parent Sample ID or Sample ID (if no processing was performed). 
Biospecimen|DataFile https://w3id.org/include - collection_sample_type False include:collection_sample_type Type of biological material comprising the collected sample (e.g. Whole blood, Bone marrow, Saliva, etc.) Biospecimen https://w3id.org/include - condition_description TRUE include:condition_description Condition as worded by data contributor - this will be displayed in the portal Condition https://w3id.org/include - container_id False include:container_id Identifier for specific container/aliquot of sample, if applicable. For example, distinct aliquots of a sample will have the same Sample ID but different Container IDs. Biospecimen https://w3id.org/include - data_access False include:data_access Type of access control on this file, determined by DCC DataFile https://w3id.org/include enum_data_access - data_category True include:data_category General category of data in file (e.g. Clinical, Genomics, Proteomics, Metabolomics, Immune maps, Transcriptomics, etc.) DataFile https://w3id.org/include - data_type False include:data_type Specific type of data contained in file (e.g. Aligned reads, Unaligned reads, SNV, CNV, Gene fusions, Isoform expression, Gene expression quantification, Structural variations, Cytokine profiles, Operation reports, Pathology reports, Histology images, Clinical supplement, Protein expression quantification, etc.) 
DataFile https://w3id.org/include - dbgap False include:dbgap dbGaP study accession code Study https://w3id.org/include - mondo_label FALSE include:mondo_label Label for condition in the Mondo Disease Ontology (MONDO) Condition https://w3id.org/include - mondo_code FALSE include:mondo_code Code for condition in the Mondo Disease Ontology (MONDO) Condition https://w3id.org/include - down_syndrome_status True include:down_syndrome_status Down Syndrome status of participant (T21 = Trisomy 21; D21 = Disomy 21, euploid) Participant https://w3id.org/include enum_down_syndrome_status - ethnicity True include:ethnicity Ethnicity of participant Participant https://w3id.org/include enum_ethnicity - experimental_strategy False include:experimental_strategy Experimental method used to obtain data in file (e.g. WGS, RNAseq, WXS, SOMAscan, Mass spec proteomics, LCMS metabolomics, Multiplex immunoassay, Meso Scale Discovery, etc.) DataFile https://w3id.org/include - condition_interpretation FALSE include:condition_interpretation "Whether condition was observed or not. ""Not Observed"" indicates participant was specifically examined for that condition, or health record specifically queried for that condition, and found to be negative. Sept. 2022 release will only include positive assertions." 
Condition https://w3id.org/include enum_condition_interpretation - condition_data_source FALSE include:condition_data_source Whether condition information was obtained from medical records (Clinical) or patient survey (Self-Reported) Condition https://w3id.org/include enum_condition_data_source - external_id True include:external_id Unique identifier for the participant, assigned by data contributor Participant https://w3id.org/include - family_id False include:family_id Unique identifer for family to which Participant belongs Participant https://w3id.org/include - family_relationship False include:family_relationship Relationship of Participant to other family members Participant https://w3id.org/include Participant - family_type True include:family_type Structure of family members participating in the study (proband-only = no family members participating; duo = proband + parent; trio = proband + 2 parents; trio+ = proband + 2 parents + other relatives) Participant https://w3id.org/include enum_family_type - father_id False include:father_id Participant ID for Participant's father Participant https://w3id.org/include - file_id False include:file_id File identifier, assigned by DCC DataFile https://w3id.org/include - file_name True include:file_name Synapse ID for file DataFile https://w3id.org/include - format True include:format Format of file (e.g. bam, cram, vcf, csv, html, png, fastq, pdf, dicom, etc.) 
DataFile https://w3id.org/include - has_biospecimen False include:has_biospecimen Link to a Biospecimen DataFile https://w3id.org/include Biospecimen - has_output False include:has_output The DataFile Output of an Assay Assay https://w3id.org/include DataFile - has_datafile False include:has_datafile Link to a DataFile Biospecimen|Participant https://w3id.org/include DataFile - has_participant False include:has_participant Link to a Participant Biospecimen|DataFile|Condition https://w3id.org/include Participant - has_subject True include:has_subject Link from Data File to a Participant DataFile https://w3id.org/include Participant - has_parent_sample False include:has_parent_sample Link from a sample to its parent Sample Biospecimen https://w3id.org/include Biospecimen - has_study False include:has_study Link to a Study Biospecimen|DataFile|Participant https://w3id.org/include Study - laboratory_procedure False include:laboratory_procedure Procedure by which Sample was derived from Parent Sample (e.g. RBC lysis, Centrifugation, Ficoll, etc.) Biospecimen https://w3id.org/include - mother_id False include:mother_id Participant ID for Participant's mother Participant https://w3id.org/include - original_file_name True include:original_file_name Name of file, assigned by data contributor DataFile https://w3id.org/include - outcomes_vital_status False include:outcomes_vital_status Whether participant is alive or dead Participant https://w3id.org/include - parent_sample_id False include:parent_sample_id Identifier for the direct parent from which Sample was derived, processed, pooled, etc. (if applicable) Biospecimen https://w3id.org/include - parent_sample_type False include:parent_sample_type Type of biological material comprising the parent sample (e.g. Plasma, Serum, White blood cells, etc.) 
Biospecimen https://w3id.org/include - participant_id True include:participant_id Unique identifier for the participant, assigned by DCC DataFile|Participant https://w3id.org/include - hpo_label FALSE include:hpo_label Label for condition in the Human Phenotype Ontology (HPO) Condition https://w3id.org/include - hpo_code FALSE include:hpo_code Code for condition in the Human Phenotype Ontology (HPO) Condition https://w3id.org/include - maxo_label FALSE include:maxo_label Label for condition in the Medical Action Ontology (MAXO) Condition https://w3id.org/include - maxo_code FALSE include:maxo_code Code for condition in the Medical Action Ontology (MAXO) Condition https://w3id.org/include - other_label FALSE include:other_label Label for condition in another ontology (if no match in HPO, MONDO, or MAXO) Condition https://w3id.org/include - other_code FALSE include:other_code Code for condition in another ontology (if no match in HPO, MONDO, or MAXO) Condition https://w3id.org/include - program True include:program Funding source for the study Study https://w3id.org/include enum_program - race True include:race Race of participant Participant https://w3id.org/include enum_race - sample_availability False include:sample_availability Whether or not the sample is potentially available for sharing through the Virtual Biorepository Biospecimen https://w3id.org/include enum_sample_availability - sample_id True include:sample_id Identifier for sample. A sample is a unique biological material; two samples with two different IDs are biologically distinct. Biospecimen https://w3id.org/include - sample_type True include:sample_type Type of biological material comprising the sample (e.g. Plasma, Serum, White blood cells, DNA, RNA, etc.) 
Biospecimen https://w3id.org/include - sex True include:sex Sex of participant Participant https://w3id.org/include enum_sex - size False include:size Size of file DataFile https://w3id.org/include - study_code True include:study_code Unique identifer for the study, assigned by DCC Study https://w3id.org/include enum_study_code - study_name True include:study_name Name of the study, chosen by data contributor Study https://w3id.org/include - validation_rules False Validation Rules include:validation_rules Rules for Validation of Property Constraints and Values Thing https://w3id.org/include - volume False include:volume Amount of sample in container Biospecimen https://w3id.org/include - volume_unit False include:volume_unit Unit of sample volume Biospecimen https://w3id.org/include - uses_biospecimen False include:uses_biospecimen The Biospecimen an Assay is performed on Assay https://w3id.org/include Biospecimen - has_aliquot False include:has_aliquot An aliquot of a sample Biospecimen https://w3id.org/include Aliquot -Biospecimen age_at_biospecimen_collection|biospecimen_storage|collection_id|collection_sample_type|container_id|has_datafile|has_participant|has_study|laboratory_procedure|parent_sample_id|parent_sample_type|sample_availability|sample_id|sample_type|volume|volume_unit|has_study|has_aliquot False Biospecimen Thing include:Biospecimen A Biospecimen Collected from A Participant https://w3id.org/include Study,Participant,DataFile -DataFile access_url|collection_id|data_access|data_category|data_type|experimental_strategy|file_id|file_name|format|has_biospecimen|has_participant|has_study|participant_id|size|original_file_name False DataFile Thing include:DataFile A DataFile Associated with a Participant or Study or Biospecimen https://w3id.org/include Study,Participant,Biospecimen -Participant 
age_at_diagnosis|age_at_phenotype_assignment|age_at_the_last_vital_status|diagnosis_icd|diagnosis_mondo|diagnosis_ncit|diagnosis_source_text|diagnosis_type|down_syndrome_status|ethnicity|external_id|family_id|family_relationship|family_type|father_id|has_datafile|has_study|mother_id|outcomes_vital_status|participant_id|phenotype_hpo|phenotype_source_text|phenotype_interpretation|race|sex False Participant Thing include:Participant A Participant in a Study https://w3id.org/include Study,DataFile -Study dbgap|program|study_code|study_name True Study Thing include:Study A Study https://w3id.org/include -Thing Thing schema:Thing Highest Level Class https://w3id.org/include -FamilyGroup has_participant False FamilyGroup Thing include:FamilyGroup A group of Participants in the same Study https://w3id.org/include Study,Participant -Aliquot False Aliquot Thing include:Aliquot An aliquot of a sample https://w3id.org/include Biospecimen -Assay uses_biospecimen|has_output False Assay Thing include:Assay An assay https://w3id.org/include Biospecimen,DataFile -Condition has_particpant diff --git a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Enums.tsv b/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Enums.tsv deleted file mode 100644 index a910797a..00000000 --- a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Enums.tsv +++ /dev/null @@ -1,70 +0,0 @@ -enum permissible_value title meaning aliases broad_mappings close_mappings comments definition_uri deprecated deprecated_element_has_exact_replacement deprecated_element_has_possible_replacement description exact_mappings examples from_schema imported_from in_language in_subset mappings narrow_mappings notes rank related_mappings see_also source todos -> enum permissible_value title meaning aliases broad_mappings close_mappings comments definition_uri deprecated deprecated_element_has_exact_replacement deprecated_element_has_possible_replacement description exact_mappings examples from_schema imported_from in_language 
in_subset mappings narrow_mappings notes rank related_mappings see_also source todos -> "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" "internal_separator: ""|""" -enum_data_access include:enum_data_access https://w3id.org/include -enum_data_access controlled Controlled -enum_data_access cpen Open -enum_data_access registered Registered -enum_down_syndrome_status include:enum_down_syndrome_status https://w3id.org/include -enum_down_syndrome_status d21 D21 -enum_down_syndrome_status t21 T21 MONDO:0008608 -enum_ethnicity include:enum_ethnicity https://w3id.org/include -enum_ethnicity asked_but_unknown Asked but unknown -enum_ethnicity hispanic_or_latino Hispanic or Latino -enum_ethnicity not_hispanic_or_latino Not Hispanic or Latino -enum_ethnicity prefer_not_to_answer Prefer not to answer -enum_ethnicity unknown Unknown -enum_family_type include:enum_family_type https://w3id.org/include -enum_family_type duo Duo -enum_family_type other Other -enum_family_type proband_only Proband-only -enum_family_type trio Trio -enum_family_type trio_plus Trio+ -enum_phenotype_interpretation include:enum_phenotype_interpretation https://w3id.org/include -enum_phenotype_interpretation not_observed Not Observed -enum_phenotype_interpretation observed Observed -enum_program include:enum_program https://w3id.org/include -enum_program include INCLUDE -enum_program kf KF -enum_race include:enum_race https://w3id.org/include -enum_race american_indian_or_alaskan_native American Indian or Alaska Native -enum_race asian Asian -enum_race black_or_african_american Black or African American -enum_race more_than_one_race More than one race -enum_race native_hawaiian_or_other_pacific_islander Native Hawaiian or 
Other Pacific Islander -enum_race other Other -enum_race white White -enum_race prefer_not_to_answer Prefer not to answer -enum_race Unknown -enum_sample_availability include:enum_sample_availability https://w3id.org/include -enum_sample_availability available Available -enum_sample_availability unavailable Unavailable -enum_sex include:enum_sex https://w3id.org/include -enum_sex female Female -enum_sex male Male -enum_sex other Other -enum_sex unknown Unknown -enum_study_code include:enum_study_code https://w3id.org/include -enum_study_code ds_cog_all DS-COG-ALL -enum_study_code ds_pcgc DS-PCGC -enum_study_code ds360_chd DS360-CHD -enum_study_code dsc DSC -enum_study_code htp HTP -enum_study_code abcds ABC-DS -enum_study_code ads ADS -enum_study_code ds_brain DS-Brain -enum_study_code ds_cog_aml DS-COG-AML -enum_study_code bri_dsr BRI-DSR -enum_study_code ds_isp DS-ISP -enum_study_code ds_nexus DS-Nexus -enum_study_code ds_pals DS-PALS -enum_study_code ds_sleep DS-Sleep -enum_study_code ecods ECODS -enum_study_code exceeds ExCEEDS -enum_study_code trc_ds TRC-DS -enum_study_code x01_desmith X01-DeSmith -enum_study_code x01_hakon X01-Hakon -enum_condition_interpretation observed Observed -enum_condition_interpretation not_observed Not Observed -enum_condition_data_source clinical Clinical -enum_condition_data_source self_reported Self-reported diff --git a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Prefixes.tsv b/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Prefixes.tsv deleted file mode 100644 index 89b4499b..00000000 --- a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Prefixes.tsv +++ /dev/null @@ -1,6 +0,0 @@ -prefix URI -> prefix prefix_reference -linkml https://w3id.org/linkml/ -include https://w3id.org/include/ -schema http://schema.org/ -sms http://sms.org/ diff --git a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Schema.tsv b/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Schema.tsv deleted file mode 100644 index 5c958572..00000000 --- 
a/src/data/sheets/INCLUDE_Portal_V1_LinkML_Schema_Schema.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Schema uri Desc Schema Prefix -> schema id description default_prefix -IncludePortalV1 https://w3id.org/include Initial Include Portal Schema include diff --git a/src/data/sheets/schemasheet.tsv b/src/data/sheets/schemasheet.tsv new file mode 100644 index 00000000..59110cb9 --- /dev/null +++ b/src/data/sheets/schemasheet.tsv @@ -0,0 +1,394 @@ +class slot title description range required multivalued enum permissible_value meaning +>class slot title description range required multivalued enum permissible_value meaning + studyCode Study Code Unique identifier for the study (generally a short acronym) enum_studyCode true + studyTitle Study Title Full title of the study string true + program Program Funding source(s) for the study (pipe-separated if multiple) enum_program true true + studyDescription Study Description Brief description of the study (2-4 sentences) string true + principalInvestigatorName Principal Investigator Name Name(s) of Principal Investigator(s) of this study; pipe-separated if multiple string true true + studyContactName Study Contact Name Name of contact person for this study; pipe-separated if multiple string true true + studyContactInstitution Study Contact Institution Institution of contact person for this study; pipe-separated if multiple string true true + studyContactEmail Study Contact Email Email address of contact person for this study; pipe-separated if multiple string true true + vbrEmail VBR Email Email address for Virtual Biorepository requests/inquiries, if participating string + vbrUrl VBR URL Link to Virtual Biorepository request form, if participating uri + vbrReadme VBR Readme Instructions for contacting or requesting samples from Virtual Biorepository, if participating string + researchDomain Research Domain Main research domain(s) of the study, other than Down syndrome; pipe-separated if multiple enum_researchDomain true true + 
participantLifespanStage Participant Lifespan Stage Focus age group(s) of the study population; pipe-separated if multiple enum_participantLifespanStage true true + selectionCriteria Selection Criteria A limited list of criteria for selection of participants in the study, provided in terms of inclusion and exclusion criteria. For Observational studies, a description of the population from which the groups or cohorts were selected (for example, primary care clinic, community sample, residents of a certain town). string + studyDesign Study Design Overall design of study, including whether it is longitudinal and whether family members/unrelated controls are also enrolled enum_studyDesign true true + clinicalDataSourceType Clinical Data Source Type Source(s) of data collected from study participants; pipe-separated if multiple enum_clinicalDataSourceType true true + studyWebsite Study Website Website for the study uri + publication Publication uri true + expectedNumberOfParticipants Expected Number of Participants integer true + guidType GUID Type System used to generate globally unique identifiers (GUIDs) enum_guidType true + guidMapped GUIDs Mapped? For studies using NDAR GUIDs, have the GUIDs been added to the INCLUDE GUID Mapping File? boolean + dbgap dbGaP string true + acknowledgments Acknowledgments Funding statement and acknowledgments for this study string true + citationStatement Citation Statement "Statement that secondary data users should use to acknowledge use of this dataset. 
E.g., ""The results analyzed and here are based in whole or in part upon data generated by the INCLUDE (INvestigation of Co-occurring conditions across the Lifespan to Understand Down syndromE) Project , and were accessed from the INCLUDE Data Hub and .""" string true + doi DOI Unique Digital Object Identifier for each Study and Dataset, minted by the DCC using DataCite uri false + doiCitation DOI Citation Bibliographic citation for DOI, generated by DataCite string false + nctId NCT ID "The unique identification code given to each clinical study upon registration at ClinicalTrials.gov. The format is ""NCT"" followed by an 8-digit number. Also known as ClinicalTrials.gov Identifier" string true false + clinicalStudyDesign Clinical Study Design The nature of the investigation or investigational use for which clinical study information is being submitted. Select one. enum_clinicalStudyDesign true false + trialPhase Trial Phase For a clinical trial of a drug product (including a biological product), the numerical phase of such clinical trial. Select only one. enum_trialPhase true false + primaryPurpose Primary Purpose The main objective of the intervention(s) being evaluated by the clinical trial. Select one. enum_primaryPurpose true false + interventionType Intervention Type For each intervention studied in the clinical study, the general type of intervention. Select one. enum_interventionType true false + intervention Intervention For interventional studies, specify the intervention(s) associated with each arm or group; at least one intervention must be specified for interventional studies. Use non-proprietary names where available. Multiple values should be pipe-separated. For observational studies, specify the intervention(s)/exposure(s) of interest, if any. string true true + armInformation Arm Information For interventional studies, a description of each arm of the clinical trial that indicates its role in the clinical trial (e.g. 
Experimental, Active Comparator, Placebo Comparator, Sham Comparator; No Intervention; Other); provides an informative title; and, if necessary, additional descriptive information (including which interventions are administered in each arm) to differentiate each arm from other arms in the clinical trial. Multiple values should be pipe-separated. For observational studies, specify the predefined participant groups (cohorts) to be studied, e.g. those with or without a condition/exposure. string true true + armAllocation Arm Allocation The method by which participants are assigned to arms in a clinical trial. enum_armAllocation true false + interventionAssignmentStrategy Intervention Assignment Strategy In an interventional study, the strategy for assigning interventions to participants. enum_interventionAssignmentStrategy true false + primaryOutcomeMeasure Primary Outcome Measure A description of each primary outcome measure (title, description, and time point/duration of assessment). Multiple values should be pipe-separated. string true true + secondaryOutcomeMeasure Secondary Outcome Measure A description of each secondary outcome measure (title, description, and time point/duration of assessment). Multiple values should be pipe-separated. string true true + datasetName Dataset Name Full name of the dataset, provided by contributor string true + datasetDescription Dataset Description Brief additional notes about the dataset (1-3 sentences) that are not already captured in the other fields string + datasetGlobalId Dataset Global ID Unique Global ID for dataset, generated by DCC string false + datasetExternalId Dataset External ID Unique identifier or code for dataset, if provided by contributor string + expectedNumberOfFiles Expected Number of Files Expected number of files associated with this dataset, including dictionaries. If additional explanation is needed, please add to Dataset Description field. 
integer false + dataCollectionStartYear Data Collection Start Year Year that data collection started string false + dataCollectionEndYear Data Collection End Year Year that data collection ended string false + accessLimitations Access Limitations Data access limitations, as defined in the GA4GH Data Use Ontology (DUO; can list more than one, pipe separated) string false + accessRequirements Access Requirements Data access requirements, as defined in the GA4GH Data Use Ontology (DUO; can list more than one, pipe separated) string false + otherRepository Other Repository URL if dataset is already deposited in a public repository other than dbGaP (e.g. LONI, Metabolomics Workbench, etc.) uri + otherAccessAuthority Other Access Authority Email or URL for dataset's Access Authority, if not dbGaP string + isHarmonized Is Harmonized All of the elements in this Dataset are harmonized and available in the INCLUDE Data Hub boolean + datasetManifestLocation Dataset Manifest Location Location of associated Dataset Manifest string + participantGlobalId Participant Global ID Unique INCLUDE global identifier for the participant, assigned by DCC string true + participantExternalId Participant External ID Unique, de-identified identifier for the participant, assigned by data contributor. External IDs must be two steps removed from personal information in the study records. 
string true + familyId Family ID Unique identifier for family to which Participant belongs, assigned by data contributor string + familyType Family Type Structure of family members participating in the study enum_familyType true + fatherId Father ID Participant External ID for Participant's father (NA if Participant is not the proband) string + motherId Mother ID Participant External ID for Participant's mother (NA if Participant is not the proband) string + siblingId Sibling ID Participant External ID for Participant's sibling(s) (NA if Participant is not the proband) string + otherFamilyMemberId Other Family Member ID Participant External ID for Participant's other family members (NA if Participant is not the proband) string + familyRelationship Family Relationship Relationship of Participant to proband enum_familyRelationship true + sex Sex Sex of Participant enum_sex true + race Race Race of Participant enum_race true + ethnicity Ethnicity Ethnicity of Participant enum_ethnicity true + downSyndromeStatus Down Syndrome Status Down Syndrome status of participant enum_downSyndromeStatus true + ageAtFirstParticipantEngagement Age at First Participant Engagement Age in days of Participant at first recorded study event (enrollment, visit, observation, sample collection, survey completion, etc.). Age at enrollment is preferred, if available. integer true + firstParticipantEngagementEvent First Participant Engagement Event Event for which Age at First Participant Engagement is given (e.g. enrollment, visit, observation, sample collection, survey completion, etc.). Age at enrollment is preferred, if available. string true + outcomesVitalStatus Outcomes Vital Status Whether participant is alive or dead enum_vital_status + ageAtLastVitalStatus Age at Last Vital Status Age in days when participant's vital status was last recorded integer + eventId Event ID Identifier for event (Visit, Survey completion, Sample collection, etc.)
to which the Condition data are linked, if applicable. There may be multiple events linked to a Participant. string + eventType Event Type Type of event for which Event ID is given (Visit, Survey completion, Sample collection, etc.) string + conditionMeasureSourceText Condition or Measure Source Text Co-occurring Condition (phenotype or diagnosis) or Measure (observation with numeric value), as described by data contributor. The Down Syndrome Genetic Diagnosis will be rolled into this field. string + ageAtConditionMeasureObservation Age At Condition or Measure Observation Age in days at which Condition or Measure was observed, recorded, or diagnosed integer + conditionInterpretation Condition Interpretation Whether Condition was observed or not enum_conditionInterpretation + conditionStatus Condition Status Whether the Condition is ongoing, has been resolved, or this is a general history of the condition without known dates enum_conditionStatus + conditionDataSource Condition Data Source Whether Condition information was obtained by the investigator or reported by participant/family member enum_conditionDataSource + hpoLabel HPO Label Label for Condition in the Human Phenotype Ontology (HPO) string + hpoCode HPO Code Code for Condition in the Human Phenotype Ontology (HPO) string + mondoLabel MONDO Label Label for Condition in the Mondo Disease Ontology (MONDO) string + mondoCode MONDO Code Code for Condition in the Mondo Disease Ontology (Mondo) string + maxoLabel MAXO Label Label for Condition in the Medical Action Ontology (MAXO) string + maxoCode MAXO Code Code for condition in the Medical Action Ontology (MAXO) string + otherLabel Other Label Label for Condition in another ontology (if no match in HPO, MONDO, or MAXO) string + otherCode Other Code Code for Condition in another ontology (if no match in HPO, MONDO, or MAXO) string + measureValue Measure Value Numeric value of Measure float + measureUnit Measure Unit Unit that is associated with Measure Value 
(e.g. kg, cm, %, x10^9/L, etc.) string + sampleGlobalId Sample Global ID INCLUDE global identifier for sample, assigned by DCC string true + sampleExternalId Sample External ID Unique identifier for sample, assigned by data contributor. A sample is a unique biological material; two samples with two different IDs are biologically distinct. string true + sampleType Sample Type Type of biological material comprising the Sample (e.g. Plasma, White blood cells, Red blood cells, DNA, RNA, Peripheral blood mononuclear cells, CD4+ Tconv cells, NK cells, Monocytes, CD8+ T cells, B cells, Granulocytes, Treg cells) string true + ageAtBiospecimenCollection Age At Biospecimen Collection Age in days of participant at time of biospecimen collection integer + parentSampleGlobalId Parent Sample Global ID INCLUDE global identifier for the direct parent from which Sample was derived, assigned by DCC string + parentSampleExternalId Parent Sample External ID Identifier for the direct parent from which Sample was derived, processed, pooled, etc. (if applicable); assigned by data contributor string + parentSampleType Parent Sample Type Type of biological material comprising the Parent Sample (e.g. Peripheral Whole Blood, Derived Cell Line, Saliva, Whole blood, WBCs) string + collectionGlobalId Collection Global ID INCLUDE global identifier for the eldest sample in a lineage, assigned by DCC string + collectionExternalId Collection External ID Identifier for the eldest sample in a lineage of processed, pooled, or aliquoted samples - typically the material actually collected from the Participant. This may be the same as Parent Sample ID or Sample ID (if no processing was performed). Assigned by data contributor. string + collectionSampleType Collection Sample Type Type of biological material comprising the Collected Sample (e.g. 
Whole blood, Not reported, Saliva, Derived cell line) string + containerGlobalId Container Global ID INCLUDE global identifier for specific container/aliquot of sample, assigned by DCC string + containerExternalId Container External ID Identifier for specific container/aliquot of sample, assigned by data contributor. For example, distinct aliquots of a sample will have the same Sample ID but different Container IDs. string + volume Volume Amount of sample in container float + volumeUnit Volume Unit Unit of sample volume string + concentration Concentration Concentration of sample in container float + concentrationUnit Concentration Unit Unit of sample concentration string + laboratoryProcedure Laboratory Procedure Procedure by which Sample was derived from Parent Sample (e.g. Centrifugation, RBC lysis, Lyse/fix buffer, FACS, PAXgene DNA, PAXgene RNA, Qiagen Allprep, Ficoll) string + biospecimenStorage Biospecimen Storage Method by which Container is stored (e.g. Minus 80 degrees Celsius, Liquid nitrogen storage) string + sampleAvailability Sample Availability Whether or not the Sample (any Container thereof) is potentially available for sharing through the Virtual Biorepository enum_Availability true + containerAvailability Container Availability Whether or not the specific Container is potentially available for sharing through the Virtual Biorepository enum_Availability + participantDataFileManifestLocation Participant-DataFile Manifest Location Location of Participant-DataFile Manifest, if file contains multiple Participants string + fileName File Name Name of file, assigned by data contributor string true + fileGlobalId File Global ID INCLUDE global file identifier, assigned by DCC string true + fileUploadLocation File Upload Location Where source file was uploaded, if not directly to an S3 bucket (e.g. 
Synapse) string + fileS3Location File S3 Location S3 bucket location of file; also serves as dewrangle descriptor string true + drsUri DRS URI Data Repository Services API Uniform Resource Identifier uriorcurie true + fileHash File Hash md5 hash of this file for validation (if known) string + dataAccess Data Access Type of access control on this file, determined by DCC enum_dataAccess true + dataCategory Data Category enum_dataCategory true + dataType Data Type string true + experimentalStrategy Experimental Strategy string true + experimentalPlatform Experimental Platform Specific platform used to perform experiment; pipe-separated if multiple (e.g. SOMAscan, MSD, Luminex, Illumina) string true + fileFormat File Format Format of file (e.g. tsv, cram, gvcf, vcf, maf, txt, pdf, html, png) string true + fileSize File Size Size of file, if known (mainly important if large) integer + fileSizeUnit File Size Unit Unit of file size string + encounterGlobalId Encounter Global ID Unique identifier for Encounter (assigned by DCC) string true false + encounterExternalId Encounter External ID Unique ID or Name of Encounter. For actual visits, this might be a visit ID; for planned encounters, use a consistent format, such as string true false + encounterDescription Encounter Description Name and/or description of Encounter string true false + encounterGroup Encounter Group Optional grouping for Encounters, e.g. treatment arms. This is a flexible field that different studies might use in different ways. string false true + encounterType Encounter Type Optional type of Encounter, e.g. pre/on/post-intervention, virtual vs. in-person, etc. This is a flexible field that different studies might use in different ways. 
string false true + seriesOrder Series Order Ordering of Encounter relative to others in group, if not already indicated by Timepoint (Study Schedule only) float false false + timepoint Timepoint Timepoint of Encounter relative to anchor (Study Schedule only -- timepoint will be provided by Participant Age at Encounter in actual data) float false false + timepointUnit Timepoint Unit Unit of timepoint (Study Schedule only) string false false + participantAgeAtEncounter Participant Age at Encounter Age in days of Participant at Encounter integer false false + activitiesPerformed Activities Performed External IDs of Activities performed at Encounter; separate with pipes if multiple string false true + samplesCollected Samples Collected External IDs of Samples collected at Encounter; separate with pipes if multiple string false true + filesGenerated Files Generated Names of Files generated during Encounter; separate with pipes if multiple string false true + activityGlobalId Activity Global ID Unique identifier for Activity (assigned by DCC) string true false + activityExternalId Activity External ID "Unique ID of Activity. Could be a ""fake"" activity for study schedule, but must be formatted like an ID (e.g. no spaces). Must match Activity ID provided in DataFile metadata. E.g. Study Schedule activities, " string true false + activityDescription Activity Description Name and/or description of Activity string true false + inputClass Input Class Class of input for Activity (NA for study schedule) enum_inputOutputClass false true + inputType Input Type Type of input for Activity (NA for study schedule). Must match Type of Input Class (dataType, sampleType, etc.) string false true + outputClass Output Class Class of output generated by Activity (NA for study schedule) enum_inputOutputClass false true + outputType Output Type Type of output generated by Activity. Must match Type of Output Class. 
For generic/planned Study Schedule activities, use Output Type = Study Schedule string false true +Study Study General information about the study +Study dataCategory Categories of data expected to be collected in this study true +Study dbgap "dbGaP ""phs"" accession code(s) associated with this Study, either for access or informational purposes (pipe-separated if multiple)" +Study publication URL for publication(s) describing the study's rationale and methodology (PubMed Central preferred but not required; pipe-separated if multiple) +Study expectedNumberOfParticipants Expected number of participants in this study (or actual number, if data has been submitted to INCLUDE DCC). If additional explanation is needed, please add to Study Description field. +ClinicalTrial Clinical Trial Study-level metadata specific to a clinical trial, copied from the study's clinicaltrials.gov page +ClinicalTrial studyCode Study +Dataset Dataset Information about a specific grouping of data files +Dataset dataCategory General category of data in Dataset; pipe-separated if multiple true +Dataset dbgap "dbGaP ""phs"" accession code(s) required to access the files in this Dataset, if applicable (pipe-separated if multiple)" +Dataset publication URL for publication(s) describing the Dataset's rationale and methodology (PubMed Central preferred but not required; pipe-separated if multiple) +Dataset expectedNumberOfParticipants Expected number of participants in this Dataset (or actual number, if data has been submitted to INCLUDE DCC). If additional explanation is needed, please add to Dataset Description field. +Dataset dataType Specific type of data contained in Dataset; pipe-separated if multiple (e.g. 
Preprocessed metabolite relative abundance, Absolute protein concentration, Aligned reads, Simple nucleotide variations, GVCF, Gene expression quantifications, Gene fusions, Somatic copy number variations, Somatic structural variations) +Dataset experimentalStrategy Experimental method used to obtain data in Dataset; pipe-separated if multiple (e.g. Whole genome sequencing, RNAseq, Multiplex immunoassay, Mass spec metabolomics) true +Dataset studyCode Study +DatasetManifest Dataset Manifest Mapping information for files in Dataset +DatasetManifest studyCode Study +DatasetManifest fileGlobalId DataFile +DatasetManifest datasetGlobalId Dataset +ParticipantSampleDataFileManifest Participant-Sample-DataFile Manifest List of Participants and/or Samples in DataFiles with multiple Participants and/or Samples +ParticipantSampleDataFileManifest studyCode Study +ParticipantSampleDataFileManifest fileName DataFile +ParticipantSampleDataFileManifest participantExternalId Participant +ParticipantSampleDataFileManifest sampleExternalId Biospecimen +Participant Participant Demographic and clinical information about the participant +Condition Condition Co-occurring conditions and other observations for the participant +Condition studyCode Study +Condition participantGlobalId Participant +Biospecimen Biospecimen A Biospecimen Collected from A Participant +Biospecimen studyCode Study +Biospecimen participantGlobalId Participant +DataFile Data File Metadata about Data Files +DataFile dataCategory General category of data in file (e.g. Clinical, Genomics, Proteomics, Metabolomics, Immune profiling, Transcriptomics) +DataFile dataType Specific type of data contained in file (e.g. 
Preprocessed metabolite relative abundance, Absolute protein concentration, Aligned reads, Simple nucleotide variations, GVCF, Gene expression quantifications, Gene fusions, Somatic copy number variations, Somatic structural variations) +DataFile experimentalStrategy Experimental method used to obtain data in file (e.g. Whole genome sequencing, RNAseq, Multiplex immunoassay, Mass spec metabolomics) +DataFile studyCode Study +DataFile participantGlobalId Participant +DataFile sampleGlobalId Biospecimen +Encounter Encounter Encounters can describe generic planned visits (e.g. for a Clinical Trial study schedule) or actual participant events +Encounter participantExternalId External ID of associated participant (for actual encounters) false true +Encounter activityExternalId External IDs of Activities performed at Encounter; separate with pipes if multiple false true +Encounter sampleExternalId External IDs of Samples collected at Encounter; separate with pipes if multiple false true +Encounter fileName Names of Files generated during Encounter; separate with pipes if multiple false true +Encounter studyCode Study +Activity Activity Activities describe sample collection/processing, assays, data generation, etc. and may be planned (e.g. 
for a Clinical Trial study schedule) or actual +Activity studyCode Study + enum_studyCode + AADSC enum_studyCode aadsc + ABC-DS enum_studyCode abc_ds + ADS enum_studyCode ads + AECOM-DS enum_studyCode aecom_ds + BEST21 enum_studyCode best21 + BrainPower enum_studyCode brainpower + BRI-DSR enum_studyCode bri_dsr + CCDS enum_studyCode ccds + CHILD-DS enum_studyCode child_ds + CHARGE-DS enum_studyCode charge_ds + DECIDAS enum_studyCode decidas + DS-ARC enum_studyCode ds_arc + DS-Brain enum_studyCode ds_brain + DS-COG-ALL enum_studyCode ds_cog_all + DS-COG-AML enum_studyCode ds_cog_aml + DS-DETERMINED enum_studyCode ds_determined + DS-HOME enum_studyCode ds_home + DS-HSAT enum_studyCode ds_hsat + DS-ISP enum_studyCode ds_isp + DS-Nexus enum_studyCode ds_nexus + DS-PALS enum_studyCode ds_pals + DS-PCGC enum_studyCode ds_pcgc + DS-Sleep enum_studyCode ds_sleep + DS-VitE enum_studyCode ds_vite + DS360-CHD enum_studyCode ds360_chd + DSC enum_studyCode dsc + DSpostBFmulti enum_studyCode dspostbfmulti + DSRRS enum_studyCode dsrrs + ECODS enum_studyCode ecods + EXcEEDS enum_studyCode exceeds + HTP enum_studyCode htp + IBIS-DS enum_studyCode ibis-ds + JAKi-DS enum_studyCode jaki_ds + OPTimal enum_studyCode optimal + TEAM-DS enum_studyCode team_ds + TOMI enum_studyCode tomi + TRC-DS enum_studyCode trc_ds + X01-deSmith enum_studyCode x01_desmith + X01-Hakonarson enum_studyCode x01_hakonarson + enum_program + INCLUDE enum_program include + KF enum_program kf + Other enum_program other + enum_researchDomain + Behavior and Behavior Mechanisms enum_researchDomain behavior_and_behavior_mechanisms mesh:D001520 + Congenital Heart Defects enum_researchDomain congenital_heart_defects mesh:D006330 + Immune System Diseases enum_researchDomain immune_system_diseases mesh:D007154 + Hematologic Diseases enum_researchDomain hematologic_diseases mesh:D006402 + Neurodevelopment enum_researchDomain neurodevelopment mesh:D065886 + Sleep Wake Disorders enum_researchDomain sleep_wake_disorders 
mesh:D012893 + All Co-occurring Conditions enum_researchDomain all_co_occurring_conditions mesh:D013568 + Physical Fitness enum_researchDomain physical_fitness mesh:D010809 + Other enum_researchDomain other + enum_participantLifespanStage + Fetal enum_participantLifespanStage fetal + Neonatal 0-28 days old enum_participantLifespanStage neonatal + Pediatric Birth-17 years old enum_participantLifespanStage pediatric + Adult 18+ years old enum_participantLifespanStage adult + enum_studyDesign + Case-Control enum_studyDesign case_control + Case Set enum_studyDesign case_set + Control Set enum_studyDesign control_set + Clinical Trial enum_studyDesign clinical_trial + Cross-Sectional enum_studyDesign cross_sectional + Family/Twins/Trios enum_studyDesign family_twins_trios + Interventional enum_studyDesign interventional + Longitudinal enum_studyDesign longitudinal + Trial Readiness Study enum_studyDesign trial_readiness_study + Tumor vs Matched Normal enum_studyDesign tumor_vs_matched_normal + enum_clinicalDataSourceType + Medical Record Data obtained directly from medical record enum_clinicalDataSourceType medical_record + Investigator Assessment Data obtained by examination, interview, etc. with investigator enum_clinicalDataSourceType investigator_assessment + Participant or Caregiver Report Data obtained from survey, questionnaire, etc. 
filled out by participant or caregiver enum_clinicalDataSourceType participant_or_caregiver_report + Other Data obtained from other source, such as tissue bank enum_clinicalDataSourceType other + Unknown enum_clinicalDataSourceType unknown + enum_dataCategory + Unharmonized Demographic/Clinical Data enum_dataCategory unharmonized_demographic_clinical_data + Harmonized Demographic/Clinical Data enum_dataCategory harmonized_demographic_clinical_data + Genomics enum_dataCategory genomics + Transcriptomics enum_dataCategory transcriptomics + Epigenomics enum_dataCategory epigenomics + Proteomics enum_dataCategory proteomics + Metabolomics enum_dataCategory metabolomics + Cognitive/Behavioral enum_dataCategory cognitive_behavioral + Immune Profiling enum_dataCategory immune_profiling + Imaging enum_dataCategory imaging + Microbiome enum_dataCategory microbiome + Fitness enum_dataCategory fitness + Physical Activity enum_dataCategory physical_activity + Other enum_dataCategory other + Sleep Study enum_dataCategory sleep_study + enum_guidType + NDAR GUID generated by NIMH Data Archive (NDA) GUID tool enum_guidType ndar + Other GUID generated by other system enum_guidType other + No GUID No GUIDs used in this study enum_guidType no_guid + enum_clinicalStudyDesign + Interventional enum_clinicalStudyDesign interventional + Observational enum_clinicalStudyDesign observational + Patient Registry enum_clinicalStudyDesign patient_registry + Expanded Access enum_clinicalStudyDesign expanded_access + enum_trialPhase + Not Applicable enum_trialPhase not_applicable + Early Phase 1 enum_trialPhase early_phase_1 + Phase 1 enum_trialPhase phase_1 + Phase 1/2 enum_trialPhase phase_1_2 + Phase 2 enum_trialPhase phase_2 + Phase 2/3 enum_trialPhase phase_2_3 + Phase 3 enum_trialPhase phase_3 + Phase 4 enum_trialPhase phase_4 + enum_primaryPurpose + Treatment enum_primaryPurpose treatment + Prevention enum_primaryPurpose prevention + Diagnostic enum_primaryPurpose diagnostic + Supportive 
Care enum_primaryPurpose supportive_care + Screening enum_primaryPurpose screening + Health Services Research enum_primaryPurpose health_services_research + Basic Science enum_primaryPurpose basic_science + Device Feasibility enum_primaryPurpose device_feasibility + Other enum_primaryPurpose other + enum_interventionType + Drug enum_interventionType drug + Device enum_interventionType device + Biological/Vaccine enum_interventionType biological_vaccine + Procedure/Surgery enum_interventionType procedure_surgery + Radiation enum_interventionType radiation + Behavioral enum_interventionType behavioral + Genetic enum_interventionType genetic + Dietary Supplement enum_interventionType dietary_supplement + Combination Product enum_interventionType combination_product + Diagnostic Test enum_interventionType diagnostic_test + Other enum_interventionType other + enum_armAllocation + Not Applicable enum_armAllocation not_applicable + Randomized enum_armAllocation randomized + Nonrandomized enum_armAllocation nonrandomized + enum_interventionAssignmentStrategy + Single Arm enum_interventionAssignmentStrategy single_arm + Parallel enum_interventionAssignmentStrategy parallel + Crossover enum_interventionAssignmentStrategy crossover + Factorial enum_interventionAssignmentStrategy factorial + Sequential enum_interventionAssignmentStrategy sequential + enum_familyType + Control-only Unrelated control, no Down syndrome family members enum_familyType control_only + Duo Proband + one parent enum_familyType duo + Other Other family structure, eg one parent + twins enum_familyType other + Proband-only Proband only, no family members participating in study enum_familyType proband_only + Trio Proband + two parents enum_familyType trio + Trio Plus Proband + two parents + other relatives enum_familyType trio_plus + enum_familyRelationship + Proband The first affected family member to join the study enum_familyRelationship proband NCIT:C64435 + Father enum_familyRelationship father 
NCIT:C25174 + Mother enum_familyRelationship mother NCIT:C25189 + Sibling enum_familyRelationship sibling NCIT:C25204 + Other relative enum_familyRelationship other_relative NCIT:C21480 + Unrelated control enum_familyRelationship unrelated_control NCIT:C25328 + enum_sex + Female enum_sex female NCIT:C16576 + Male enum_sex male NCIT:C20197 + Other enum_sex other NCIT:C17649 + Unknown enum_sex unknown NCIT:C17998 + enum_race + American Indian or Alaska Native enum_race american_indian_or_alaska_native NCIT:C41259 + Asian enum_race asian NCIT:C41260 + Black or African American enum_race black_or_african_american NCIT:C16352 + More than one race enum_race more_than_one_race NCIT:C67109 + Native Hawaiian or Other Pacific Islander enum_race native_hawaiian_or_other_pacific_islander NCIT:C41219 + Other enum_race other NCIT:C17649 + White enum_race white NCIT:C41261 + Prefer not to answer enum_race prefer_not_to_answer NCIT:C132222 + Unknown enum_race unknown NCIT:C17998 + East Asian UK only; do not use for US data enum_race east_asian NCIT:C161419 + Latin American UK only; do not use for US data enum_race latin_american NCIT:C126531 + Middle Eastern or North African UK only; do not use for US data enum_race middle_eastern_or_north_african NCIT:C43866 + South Asian UK only; do not use for US data enum_race south_asian NCIT:C41263 + enum_ethnicity + Hispanic or Latino enum_ethnicity hispanic_or_latino NCIT:C17459 + Not Hispanic or Latino enum_ethnicity not_hispanic_or_latino NCIT:C41222 + Prefer not to answer enum_ethnicity prefer_not_to_answer NCIT:C132222 + Unknown enum_ethnicity unknown NCIT:C17998 + enum_downSyndromeStatus + D21 Disomy 21 (euploid) enum_downSyndromeStatus d21 + T21 Trisomy 21 (Down syndrome) enum_downSyndromeStatus t21 MONDO:0008608 + enum_vital_status + Dead enum_vital_status dead NCIT:C28554 + Alive enum_vital_status alive NCIT:C37987 + Unknown or not available enum_vital_status unknown_or_not_available NCIT:C17998 + enum_conditionInterpretation + 
Observed Condition was observed or reported (this will be the case for most conditions) enum_conditionInterpretation observed + Not Observed Participant was specifically examined or medical record queried for condition and found to be negative enum_conditionInterpretation not_observed + enum_conditionStatus + Current Condition is ongoing enum_conditionStatus current + Resolved Condition has been resolved enum_conditionStatus resolved + History Of This is a general history of the condition, without known dates enum_conditionStatus history_of + enum_conditionDataSource + Clinical Information about condition was obtained from medical records or reported by investigator enum_conditionDataSource clinical + Self-reported Information about condition was reported by participant or family member enum_conditionDataSource self_reported + enum_dataAccess + Controlled enum_dataAccess controlled + Open enum_dataAccess open + Registered enum_dataAccess registered + enum_Availability + Available Sample or Container is potentially available to be requested through the Virtual Biorepository (see VBR contact info in Study page) enum_Availability available + Unavailable Sample or Container either was available through Virtual Biorepository but has been used up, or is part of a study that is not participating in the VBR enum_Availability unavailable + enum_inputOutputClass + Participant enum_inputOutputClass participant + Biospecimen enum_inputOutputClass biospecimen + DataFile enum_inputOutputClass datafile diff --git a/src/data/sheets/slots.tsv b/src/data/sheets/slots.tsv new file mode 100644 index 00000000..27954fb3 --- /dev/null +++ b/src/data/sheets/slots.tsv @@ -0,0 +1,2 @@ +class slot title description range required multivalued enum permissible_value title description meaning +>class slot title description range required multivalued enum permissible_value title description meaning diff --git a/src/data_validation/csv2json.py b/src/data_validation/csv2json.py new file mode 
# Patch metadata that shared this line with the script (carried verbatim, not Python):
#   100644 index 00000000..004f7a45 --- /dev/null +++ b/src/data_validation/csv2json.py @@ -0,0 +1,155 @@
"""Pre-process a study CSV into JSON using the LinkML schema as a guide.

For every CSV column (slot) the script:
  * splits pipe-separated cells into lists when the slot is multivalued,
  * normalizes enum-constrained values to snake_case and warns when the
    normalized value is not a permissible value of the slot's enum,
  * casts the remaining values to the datatype named by the slot's ``range``.

Warnings are printed to stdout; conversion never aborts on a bad cell.
"""
import csv
import json
import re
from pathlib import Path

SCHEMA_PATH = Path("src/linkml/include_schema.yaml").resolve()
INPUT_CSV = Path("src/data/input/study_test_data_utf8.csv").resolve()
OUTPUT_JSON = Path("src/data/input/study_data.json").resolve()


def _load_schema(schema_path):
    """Parse the YAML schema at ``schema_path``; an empty file yields ``{}``."""
    # Imported lazily so the pure conversion helpers below can be imported
    # (and unit-tested) on machines that do not have PyYAML installed.
    import yaml

    with open(schema_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def _slot_definitions(schema):
    """Return ``{slot_name: definition}``, mapping body-less slots to ``{}``.

    A slot written as a bare ``name:`` parses to ``None``; replacing it with an
    empty dict lets callers use ``.get`` / ``in`` without crashing.
    """
    slots = schema.get("slots") or {}
    return {name: (definition or {}) for name, definition in slots.items()}


def get_multivalued_slots(schema_path):
    """Return list of slots that are multivalued in the schema."""
    definitions = _slot_definitions(_load_schema(schema_path))
    return [name for name, slot in definitions.items() if slot.get("multivalued")]


def get_enum_slots(schema_path):
    """
    Return dict of slots constrained by enums.
    Keys = slot names, Values = set of normalized permissible values.

    A slot is considered enum-constrained when it has an ``enum`` key, an
    inline ``values`` list, or a ``range`` whose name starts with ``enum_``.
    Slots that name an enum missing from the schema are left out.
    """
    schema = _load_schema(schema_path)
    enums = schema.get("enums") or {}

    enum_slots = {}
    for name, slot in _slot_definitions(schema).items():
        if "enum" in slot:
            enum_name = slot["enum"]
        elif "values" in slot:
            # Inline list of allowed values: no lookup in ``enums`` needed.
            enum_slots[name] = {normalize_value(v) for v in slot["values"]}
            continue
        elif slot.get("range") and str(slot["range"]).startswith("enum_"):
            enum_name = slot["range"]
        else:
            continue

        if enum_name and enum_name in enums:
            # ``enums[enum_name]`` may itself be body-less (None) in the YAML.
            permissible = (enums[enum_name] or {}).get("permissible_values") or {}
            enum_slots[name] = {normalize_value(v) for v in permissible}

    return enum_slots


def get_slot_types(schema_path):
    """Return dict of slot → range (datatype or enum); default is "string"."""
    definitions = _slot_definitions(_load_schema(schema_path))
    return {name: slot.get("range", "string") for name, slot in definitions.items()}


def normalize_value(value: str) -> str:
    """Normalize values to snake_case (lowercase with underscores).

    Falsy input (``""`` / ``None``) is returned unchanged.
    """
    if not value:
        return value
    value = value.strip().lower()
    value = re.sub(r"[ \-/]+", "_", value)  # replace spaces, slashes, hyphens
    value = re.sub(r"_+", "_", value)  # collapse multiple underscores
    return value


def cast_value(value, datatype):
    """Cast CSV string value to correct datatype based on schema range.

    Returns ``None`` for ``None``/empty input. When the text cannot be cast, a
    warning is printed and the original string is returned unchanged.
    """
    if value is None or value == "":
        return None

    if datatype == "integer":
        try:
            return int(value)
        except ValueError:
            print(f"⚠️ Warning: Cannot cast '{value}' to integer, leaving as string")
            return value

    if datatype in {"float", "double", "number"}:
        try:
            return float(value)
        except ValueError:
            print(f"⚠️ Warning: Cannot cast '{value}' to float, leaving as string")
            return value

    if datatype == "boolean":
        val = value.strip().lower()
        if val in {"true", "1", "yes"}:
            return True
        if val in {"false", "0", "no"}:
            return False
        print(f"⚠️ Warning: Cannot cast '{value}' to boolean, leaving as string")
        return value

    return value  # default: keep as string


def validate_enum(slot, value, enum_slots):
    """Check if value is valid for the slot's enum. Warn if not.

    The value is always returned unchanged; validation is advisory only.
    """
    permissible = enum_slots.get(slot, set())
    if permissible and value not in permissible:
        print(f"⚠️ Warning: Value '{value}' not in permissible values for slot '{slot}'")
    return value


def _convert_cell(slot, raw, array_slots, enum_slots, slot_types, delimiter):
    """Convert one raw CSV cell for ``slot`` into its JSON-ready value."""
    datatype = slot_types.get(slot, "string")
    # DictReader fills missing trailing cells with None.
    value = raw.strip() if raw is not None else ""
    is_enum = slot in enum_slots

    if slot in array_slots:
        # Drop empty segments so "a|" or "a||b" do not produce ""/None entries.
        parts = [part.strip() for part in value.split(delimiter)]
        parts = [part for part in parts if part]
        if is_enum:
            return [validate_enum(slot, normalize_value(part), enum_slots) for part in parts]
        return [cast_value(part, datatype) for part in parts]

    if value == "":
        return None  # empty string → null
    if is_enum:
        return validate_enum(slot, normalize_value(value), enum_slots)
    return cast_value(value, datatype)


def preprocess_csv_to_json(input_csv, array_slots, enum_slots, slot_types, delimiter="|"):
    """Convert CSV to JSON with type casting, enum normalization, and array handling.

    Args:
        input_csv: Path (or path string) of the CSV file; the first row is the header.
        array_slots: Container of slot names whose cells hold ``delimiter``-separated lists.
        enum_slots: Mapping slot name → set of normalized permissible values.
        slot_types: Mapping slot name → datatype name; unknown slots are treated as strings.
        delimiter: Separator used inside multivalued cells.

    Returns:
        A list with one dict per CSV row, keyed by header name. Empty
        single-valued cells become ``None``; empty multivalued cells become ``[]``.
    """
    records = []
    # "utf-8-sig" strips a leading byte-order mark if present (plain UTF-8 is
    # read unchanged); otherwise the BOM would be glued to the first header name.
    with Path(input_csv).open(newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            record = {}
            for slot, raw in row.items():
                if slot is None:
                    # Surplus fields beyond the header are collected by
                    # DictReader as a list under the key None.
                    print(f"⚠️ Warning: Ignoring {len(raw)} value(s) beyond the last header column: {raw}")
                    continue
                record[slot] = _convert_cell(slot, raw, array_slots, enum_slots, slot_types, delimiter)
            records.append(record)
    return records


def main():
    """Read the schema, convert INPUT_CSV and write the result to OUTPUT_JSON."""
    multivalued_slots = get_multivalued_slots(SCHEMA_PATH)
    enum_slots = get_enum_slots(SCHEMA_PATH)
    slot_types = get_slot_types(SCHEMA_PATH)

    # Add known array slots that might not have multivalued: true in schema
    extra_array_slots = ["publication", "acknowledgments", "citationStatement"]
    # A set gives constant-time membership checks for every cell.
    array_slots = set(multivalued_slots) | set(extra_array_slots)

    print(f"Array slots detected: {sorted(array_slots)}")
    print(f"Enum slots detected: {list(enum_slots.keys())}")
    print(f"Slot types detected: {slot_types}")

    data = preprocess_csv_to_json(INPUT_CSV, array_slots, enum_slots, slot_types)

    with OUTPUT_JSON.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✅ Preprocessed JSON written to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Remainder of the patch text that shared the original line with the end of
# this script (not Python; carried verbatim so no patch content is lost):
#
# diff --git a/src/linkml/include_schema.yaml b/src/linkml/include_schema.yaml index b49ab537..02456d1e 100644 --- a/src/linkml/include_schema.yaml +++ b/src/linkml/include_schema.yaml
# @@ -68,6 +68,28 @@ classes: description: URL for publication(s) describing the study's rationale and methodology (PubMed Central preferred but not required; pipe-separated if multiple) expectedNumberOfParticipants: description: Expected number of participants in this study (or actual number, if data has been submitted to INCLUDE DCC). If additional explanation is needed, please add to Study Description field.
# + ClinicalTrial: + definition_uri: include:ClinicalTrial + description: Information specific to clinical trials + title: Clinical Trial + slots: + - studyCode + - nctId + - clinicalStudyDesign + - trialPhase + - primaryPurpose + - interventionType + - intervention + - armInformation + - armAllocation + - interventionAssignmentStrategy + - primaryOutcomeMeasure + - secondaryOutcomeMeasure + - otherOutcomeMeasure + slot_usage: + # Reference Slots + studyCode: + range: Study
# Dataset: definition_uri: include:Dataset description: Information about a specific grouping of data files
# @@ -134,23 +156,26 @@ classes: range: DataFile datasetGlobalId: range: Dataset
# - ParticipantDataFileManifest: - definition_uri: include:ParticipantDataFileManifest - description: List of Participants in DataFiles with multiple Participants - title: Participant-DataFile Manifest
# + ParticipantSampleDataFileManifest: + definition_uri: include:ParticipantSampleDataFileManifest + description: List of Participants and/or Samples in DataFiles with multiple Participants and/or Samples + title: Participant-Sample-DataFile Manifest
# slots: - studyCode - participantExternalId + - sampleExternalId - fileName #TODO: add Dictonary File Name & Global ID slot_usage: # Reference Slots studyCode: range: Study
# - fileGlobalId: + fileName: range: DataFile - datasetGlobalId: - range: Dataset +
# ---------------------------------------------------------------------------
participantExternalId: + range: Participant + sampleExternalId: + range: Biospecimen Participant: definition_uri: include:Participant description: Demographic and clinical information about the participant @@ -174,12 +199,6 @@ classes: - firstParticipantEngagementEvent - outcomesVitalStatus - ageAtLastVitalStatus - slot_usage: - studyCode: - multivalued: true - range: Study # Reference Slots - participantExternalId: - multivalued: true Condition: definition_uri: include:Condition description: Co-occurring conditions and other observations for the participant @@ -404,7 +423,8 @@ slots: multivalued: true selectionCriteria: definition_uri: include:selectionCriteria - description: Brief description of inclusion and/or exclusion criteria for the study + aliases: clinicaltrials.gov Eligibility Criteria + description: A limited list of criteria for selection of participants in the study, provided in terms of inclusion and exclusion criteria. For Observational studies, a description of the population from which the groups or cohorts were selected (for example, primary care clinic, community sample, residents of a certain town). title: Selection Criteria range: string studyDesign: @@ -427,6 +447,7 @@ slots: description: Website for the study title: Study Website range: uri + multivalued: true publication: definition_uri: include:publication title: Publication @@ -480,6 +501,102 @@ slots: title: DOI Citation range: string required: false #change to true after pilot + nctId: + definition_uri: include:nctId + title: NCT ID + aliases: clinicaltrials.gov NCT Number + description: The unique identification code given to each clinical study upon registration at ClinicalTrials.gov. The format is "NCT" followed by an 8-digit number. 
Also known as ClinicalTrials.gov Identifier + range: string + required: true + multivalued: false + clinicalStudyDesign: + definition_uri: include:clinicalStudyDesign + title: Clinical Study Design + aliases: clinicaltrials.gov Study Type + description: The nature of the investigation or investigational use for which clinical study information is being submitted. Select one. + range: enum_clinicalStudyDesign + required: true + multivalued: false + trialPhase: + definition_uri: include:trialPhase + title: Trial Phase + aliases: clinicaltrials.gov Study Phase + description: For a clinical trial of a drug product (including a biological product), the numerical phase of such clinical trial. Select only one. + range: enum_trialPhase + required: true + multivalued: false + primaryPurpose: + definition_uri: include:primaryPurpose + title: Primary Purpose + aliases: clinicaltrials.gov Primary Purpose + description: The main objective of the intervention(s) being evaluated by the clinical trial. Select one. + range: enum_primaryPurpose + required: true + multivalued: false + interventionType: + definition_uri: include:interventionType + title: Intervention Type + aliases: clinicaltrials.gov Intervention Type + description: For each intervention studied in the clinical study, the general type of intervention. Select one. + range: enum_interventionType + required: true + multivalued: false + intervention: + definition_uri: include:intervention + title: Intervention + aliases: clinicaltrials.gov Intervention + description: For interventional studies, specify the intervention(s) associated with each arm or group; at least one intervention must be specified for interventional studies. Use non-proprietary names where available. Multiple values should be pipe-separated. For observational studies, specify the intervention(s)/exposure(s) of interest, if any. 
+ range: string + required: true + multivalued: true + armInformation: + definition_uri: include:armInformation + title: Arm Information + aliases: clinicaltrials.gov Study Arms + description: For interventional studies, a description of each arm of the clinical trial that indicates its role in the clinical trial (e.g. Experimental, Active Comparator, Placebo Comparator, Sham Comparator; No Intervention; Other); provides an informative title; and, if necessary, additional descriptive information (including which interventions are administered in each arm) to differentiate each arm from other arms in the clinical trial. Multiple values should be pipe-separated. For observational studies, specify the predefined participant groups (cohorts) to be studied, e.g. those with or without a condition/exposure. + range: string + required: true + multivalued: true + armAllocation: + definition_uri: include:armAllocation + title: Arm Allocation + aliases: clinicaltrials.gov Allocation + description: The method by which participants are assigned to arms in a clinical trial. + range: enum_armAllocation + required: true + multivalued: false + interventionAssignmentStrategy: + definition_uri: include:interventionAssignmentStrategy + title: Intervention Assignment Strategy + aliases: clinicaltrials.gov Interventional Study Model + description: In an interventional study, the strategy for assigning interventions to participants. + range: enum_interventionAssignmentStrategy + required: true + multivalued: false + primaryOutcomeMeasure: + definition_uri: include:primaryOutcomeMeasure + title: Primary Outcome Measure + aliases: clinicaltrials.gov Primary Outcome Measures + description: A description of each primary outcome measure (title, description, and time point/duration of assessment). Multiple values should be pipe-separated. 
+ range: string + required: true + multivalued: true + secondaryOutcomeMeasure: + definition_uri: include:secondaryOutcomeMeasure + title: Secondary Outcome Measure + aliases: clinicaltrials.gov Secondary Outcome Measures + description: A description of each secondary outcome measure (title, description, and time point/duration of assessment). Multiple values should be pipe-separated. + range: string + required: true + multivalued: true + otherOutcomeMeasure: + definition_uri: include:otherOutcomeMeasure + title: Other Outcome Measure + aliases: clinicaltrials.gov Other Outcome Measures + description: A description of other outcome measures (title, description, and time point/duration of assessment). Multiple values should be pipe-separated. + range: string + required: true + multivalued: true datasetName: definition_uri: include:datasetName description: Full name of the dataset, provided by contributor @@ -539,6 +656,7 @@ slots: description: URL if dataset is already deposited in a public repository other than dbGaP (e.g. LONI, Metabolomics Workbench, etc.) 
title: Other Repository range: uri + multivalued: true otherAccessAuthority: definition_uri: include:otherAccessAuthority description: Email or URL for dataset's Access Authority, if not dbGaP @@ -941,15 +1059,6 @@ slots: range: string # eventually want ontology terms here enums: - enum_program: - definition_uri: include:enum_program - permissible_values: - include: - title: INCLUDE - kf: - title: KF - other: - title: Other enum_studyCode: definition_uri: include:enum_studyCode permissible_values: @@ -961,6 +1070,10 @@ enums: title: ADS aecom_ds: title: AECOM-DS + apap21: + title: APAP21 + arc_ds: + title: ARC-DS best21: title: BEST21 brainpower: @@ -975,8 +1088,6 @@ enums: title: CHARGE-DS decidas: title: DECIDAS - ds_arc: - title: DS-ARC ds_brain: title: DS-Brain ds_cog_all: @@ -987,6 +1098,8 @@ enums: title: DS-DETERMINED ds_home: title: DS-HOME + ds_hpbm: + title: DS-HPBM ds_hsat: title: DS-HSAT ds_isp: @@ -1005,19 +1118,22 @@ enums: title: DS360-CHD dsc: title: DSC - dspostbfmulti: - title: DSpostBFmulti dsrrs: title: DSRRS + each_ds: + title: EACH-DS ecods: title: ECODS exceeds: title: EXcEEDS - HTP: + htp: + title: HTP ibis-ds: title: IBIS-DS jaki_ds: title: JAKi-DS + mosaic_ds: + title: MOSAIC-DS optimal: title: OPTimal team_ds: @@ -1030,6 +1146,15 @@ enums: title: X01-deSmith x01_hakonarson: title: X01-Hakonarson + enum_program: + definition_uri: include:enum_program + permissible_values: + include: + title: INCLUDE + kf: + title: KF + other: + title: Other enum_researchDomain: #TODO: replace/add NIH internal categories from Huiqing definition_uri: include:enum_researchDomain @@ -1049,6 +1174,9 @@ enums: neurodevelopment: title: Neurodevelopment meaning: mesh:D065886 + nutritional_and_metabolic_diseases: + title: Nutritional and Metabolic Diseases + meaning: mesh:D009750 sleep_wake_disorders: title: Sleep Wake Disorders meaning: mesh:D012893 @@ -1058,6 +1186,9 @@ enums: physical_fitness: title: Physical Fitness meaning: mesh:D010809 + 
respiratory_tract_diseases: + title: Respiratory Tract Diseases + meaning: mesh:D012140 other: title: Other enum_participantLifespanStage: @@ -1093,6 +1224,8 @@ enums: title: Interventional longitudinal: title: Longitudinal + technology_development: + title: Technology Development trial_readiness_study: title: Trial Readiness Study tumor_vs_matched_normal: @@ -1109,7 +1242,10 @@ enums: description: Data obtained by examination, interview, etc. with investigator participant_or_caregiver_report: title: Participant or Caregiver Report - description: Data obtained from survey, questionnaire, etc. filled out by participant or caregiver + description: Data obtained from survey, questionnaire, app, etc. filled out by participant or caregiver + wearable: + title: Wearable + description: Data collected by wearable or other device other: title: Other description: Data obtained from other source, such as tissue bank @@ -1132,6 +1268,8 @@ enums: title: Proteomics metabolomics: title: Metabolomics + adherence: + title: Adherence cognitive_behavioral: title: Cognitive/Behavioral immune_profiling: @@ -1147,7 +1285,7 @@ enums: other: title: Other sleep_study: - title: Sleep Study + title: Sleep enum_guidType: definition_uri: include:enum_guidType permissible_values: @@ -1157,64 +1295,128 @@ enums: other: title: Other description: GUID generated by other system - No GUID: + no_guid: + title: No GUID description: No GUIDs used in this study - enum_conditionInterpretation: + enum_clinicalStudyDesign: + definition_uri: include:enum_clinicalStudyDesign permissible_values: - observed: - title: Observed - description: Condition was observed or reported (this will be the case for most conditions) - not_observed: - title: Not Observed - description: Participant was specifically examined or medical record queried for condition and found to be negative - enum_conditionDataSource: #replace with enum_clinicalDataSourceType & re-harmonize data - name: enum_conditionDataSource + interventional: 
+ title: Interventional + observational: + title: Observational + patient_registry: + title: Patient Registry + expanded_access: + title: Expanded Access + enum_trialPhase: + definition_uri: include:enum_trialPhase permissible_values: - clinical: - title: Clinical - description: Information about condition was obtained from medical records or reported by investigator - self_reported: - title: Self-reported - description: Information about condition was reported by participant or family member - enum_conditionStatus: + not_applicable: + title: Not Applicable + early_phase_1: + title: Early Phase 1 + phase_1: + title: Phase 1 + phase_1_2: + title: Phase 1/2 + phase_2: + title: Phase 2 + phase_2_3: + title: Phase 2/3 + phase_3: + title: Phase 3 + phase_4: + title: Phase 4 + enum_primaryPurpose: + definition_uri: include:enum_primaryPurpose permissible_values: - current: - title: Current - description: Condition is ongoing - resolved: - title: Resolved - description: Condition has been resolved - history_of: - title: History Of - description: This is a general history of the condition, without known dates - enum_downSyndromeStatus: - definition_uri: include:enum_downSyndromeStatus + treatment: + title: Treatment + prevention: + title: Prevention + diagnostic: + title: Diagnostic + supportive_care: + title: Supportive Care + screening: + title: Screening + health_services_research: + title: Health Services Research + basic_science: + title: Basic Science + device_feasibility: + title: Device Feasibility + other: + title: Other + enum_interventionType: + definition_uri: include:enum_interventionType permissible_values: - d21: - title: D21 - description: Disomy 21 (euploid) - t21: - title: T21 - description: Trisomy 21 (Down syndrome) - meaning: MONDO:0008608 - enum_ethnicity: - definition_uri: include:enum_ethnicity + drug: + title: Drug + device: + title: Device + biological_vaccine: + title: Biological/Vaccine + procedure_surgery: + title: Procedure/Surgery + 
radiation: + title: Radiation + behavioral: + title: Behavioral + genetic: + title: Genetic + dietary_supplement: + title: Dietary Supplement + combination_product: + title: Combination Product + diagnostic_test: + title: Diagnostic Test + other: + title: Other + enum_armAllocation: + definition_uri: include:enum_armAllocation permissible_values: - # asked_but_unknown: - # text: asked_but_unknown - # title: Asked but unknown - hispanic_or_latino: - title: Hispanic or Latino - meaning: NCIT:C17459 - not_hispanic_or_latino: - title: Not Hispanic or Latino - meaning: NCIT:C41222 - prefer_not_to_answer: - title: Prefer not to answer - meaning: NCIT:C132222 - unknown: - title: Unknown - meaning: NCIT:C17998 + not_applicable: + title: Not Applicable + randomized: + title: Randomized + nonrandomized: + title: Nonrandomized + enum_interventionAssignmentStrategy: + definition_uri: include:enum_interventionAssignmentStrategy + permissible_values: + single_arm: + title: Single Group + parallel: + title: Parallel + crossover: + title: Crossover + factorial: + title: Factorial + sequential: + title: Sequential + enum_familyType: + definition_uri: include:enum_familyType + permissible_values: + control_only: + title: Control-only + description: Unrelated control, no Down syndrome family members + duo: + title: Duo + description: Proband + one parent + other: + title: Other + description: Other family structure, eg one parent + twins + proband_only: + title: Proband-only + description: Proband only, no family members participating in study + trio: + title: Trio + description: Proband + two parents + trio_plus: + title: Trio Plus #need to reharmonize data + description: Proband + two parents + other relatives enum_familyRelationship: definition_uri: include:enum_familyRelationship permissible_values: @@ -1237,27 +1439,21 @@ enums: unrelated_control: title: Unrelated control meaning: NCIT:C25328 - enum_familyType: - definition_uri: include:enum_familyType + enum_sex: + 
definition_uri: include:enum_sex permissible_values: - control_only: - title: Control-only - description: Unrelated control, no Down syndrome family members - duo: - title: Duo - description: Proband + one parent + female: + title: Female + meaning: NCIT:C16576 + male: + title: Male + meaning: NCIT:C20197 other: title: Other - description: Other family structure, eg one parent + twins - proband_only: - title: Proband-only - description: Proband only, no family members participating in study - trio: - title: Trio - description: Proband + two parents - trio_plus: - title: Trio Plus #need to reharmonize data - description: Proband + two parents + other relatives + meaning: NCIT:C17649 + unknown: + title: Unknown + meaning: NCIT:C17998 enum_race: definition_uri: include:enum_race permissible_values: @@ -1304,21 +1500,34 @@ enums: title: South Asian description: UK only; do not use for US data meaning: NCIT:C41263 - enum_sex: - definition_uri: include:enum_sex + enum_ethnicity: + definition_uri: include:enum_ethnicity permissible_values: - female: - title: Female - meaning: NCIT:C16576 - male: - title: Male - meaning: NCIT:C20197 - other: - title: Other - meaning: NCIT:C17649 + # asked_but_unknown: + # text: asked_but_unknown + # title: Asked but unknown + hispanic_or_latino: + title: Hispanic or Latino + meaning: NCIT:C17459 + not_hispanic_or_latino: + title: Not Hispanic or Latino + meaning: NCIT:C41222 + prefer_not_to_answer: + title: Prefer not to answer + meaning: NCIT:C132222 unknown: title: Unknown meaning: NCIT:C17998 + enum_downSyndromeStatus: + definition_uri: include:enum_downSyndromeStatus + permissible_values: + d21: + title: D21 + description: Disomy 21 (euploid) + t21: + title: T21 + description: Trisomy 21 (Down syndrome) + meaning: MONDO:0008608 enum_vital_status: definition_uri: include:vital_status permissible_values: @@ -1331,6 +1540,34 @@ enums: unknown_or_not_available: title: Unknown or not available meaning: NCIT:C17998 + 
enum_conditionInterpretation: + permissible_values: + observed: + title: Observed + description: Condition was observed or reported (this will be the case for most conditions) + not_observed: + title: Not Observed + description: Participant was specifically examined or medical record queried for condition and found to be negative + enum_conditionStatus: + permissible_values: + current: + title: Current + description: Condition is ongoing + resolved: + title: Resolved + description: Condition has been resolved + history_of: + title: History Of + description: This is a general history of the condition, without known dates + enum_conditionDataSource: #replace with enum_clinicalDataSourceType & re-harmonize data + name: enum_conditionDataSource + permissible_values: + clinical: + title: Clinical + description: Information about condition was obtained from medical records or reported by investigator + self_reported: + title: Self-reported + description: Information about condition was reported by participant or family member enum_dataAccess: definition_uri: include:enum_dataAccess permissible_values: