From 40a7aa2677e9c60092e8663f2f9fb5a2f831da2e Mon Sep 17 00:00:00 2001 From: Jennifer Polsgrove Date: Sun, 7 Dec 2025 20:28:45 -0600 Subject: [PATCH 1/3] Added COVID-Red dataset --- examples/covidred_example.py | 385 ++++++++++++++++++++++++++ pyhealth/datasets/__init__.py | 1 + pyhealth/datasets/covidred_dataset.py | 312 +++++++++++++++++++++ pyhealth/tasks/__init__.py | 5 + pyhealth/tasks/covidred_tasks.py | 201 ++++++++++++++ 5 files changed, 904 insertions(+) create mode 100644 examples/covidred_example.py create mode 100644 pyhealth/datasets/covidred_dataset.py create mode 100644 pyhealth/tasks/covidred_tasks.py diff --git a/examples/covidred_example.py b/examples/covidred_example.py new file mode 100644 index 000000000..ccc56ef83 --- /dev/null +++ b/examples/covidred_example.py @@ -0,0 +1,385 @@ +""" +COVID-RED Dataset Example for PyHealth + +This script demonstrates how to: +1. Load the COVID-RED wearable device dataset +2. Define a COVID-19 detection task +3. Train a simple LSTM classifier for early COVID-19 detection + +Dataset: Remote Early Detection of SARS-CoV-2 infections (COVID-RED) +Source: https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7 +""" + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader +import numpy as np +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score + +# Import PyHealth components (adjust imports based on actual PyHealth structure) +try: + from pyhealth.datasets import COVIDREDDataset + from pyhealth.tasks import covidred_detection_fn, covidred_prediction_fn +except ImportError: + # For standalone testing, import from local files + import sys + sys.path.insert(0, '.') + from covidred_dataset import COVIDREDDataset + from covidred_tasks import covidred_detection_fn, covidred_prediction_fn + + +class LSTMClassifier(nn.Module): + """ + Simple LSTM classifier for COVID-19 detection from time series data. + + This model processes multivariate time series of wearable device measurements + (heart rate, steps, sleep) to predict COVID-19 infection. + """ + + def __init__(self, input_size, hidden_size=64, num_layers=2, num_classes=2, dropout=0.3): + """ + Parameters + ---------- + input_size : int + Number of features per time step (e.g., 8 for COVID-RED) + hidden_size : int + Number of LSTM hidden units + num_layers : int + Number of LSTM layers + num_classes : int + Number of output classes (2 for binary classification) + dropout : float + Dropout probability + """ + super(LSTMClassifier, self).__init__() + + self.hidden_size = hidden_size + self.num_layers = num_layers + + # LSTM layer + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0, + bidirectional=True + ) + + # Fully connected layers + self.fc1 = nn.Linear(hidden_size * 2, hidden_size) # *2 for bidirectional + self.relu = nn.ReLU() + self.dropout = nn.Dropout(dropout) + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + """ + Forward pass. 
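For illustration
+        (shapes only, assuming the default constructor and the 8 COVID-RED
+        features over a 7-day window):
+
+        >>> model = LSTMClassifier(input_size=8)
+        >>> logits = model(torch.randn(32, 8, 7))
+        >>> logits.shape
+        torch.Size([32, 2])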
+ + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, n_features, seq_len) + + Returns + ------- + torch.Tensor + Output logits of shape (batch_size, num_classes) + """ + # Transpose to (batch_size, seq_len, n_features) for LSTM + x = x.transpose(1, 2) + + # LSTM forward pass + lstm_out, _ = self.lstm(x) + + # Take the output from the last time step + last_output = lstm_out[:, -1, :] + + # Fully connected layers + out = self.fc1(last_output) + out = self.relu(out) + out = self.dropout(out) + out = self.fc2(out) + + return out + + +def collate_fn(batch): + """ + Custom collate function to batch samples from COVIDREDDataset. + + Parameters + ---------- + batch : list + List of sample dictionaries from the dataset. + + Returns + ------- + dict + Batched data with stacked tensors. + """ + signals = torch.stack([item["signal"] for item in batch]) + labels = torch.tensor([item["label"] for item in batch], dtype=torch.long) + patient_ids = [item["patient_id"] for item in batch] + visit_ids = [item["visit_id"] for item in batch] + + return { + "signal": signals, + "label": labels, + "patient_id": patient_ids, + "visit_id": visit_ids, + } + + +def train_epoch(model, dataloader, criterion, optimizer, device): + """Train the model for one epoch.""" + model.train() + total_loss = 0 + all_labels = [] + all_predictions = [] + + for batch in dataloader: + signals = batch["signal"].to(device) + labels = batch["label"].to(device) + + # Forward pass + optimizer.zero_grad() + outputs = model(signals) + loss = criterion(outputs, labels) + + # Backward pass + loss.backward() + optimizer.step() + + total_loss += loss.item() + + # Get predictions + _, predicted = torch.max(outputs, 1) + all_labels.extend(labels.cpu().numpy()) + all_predictions.extend(predicted.cpu().numpy()) + + avg_loss = total_loss / len(dataloader) + accuracy = accuracy_score(all_labels, all_predictions) + + return avg_loss, accuracy + + +def evaluate(model, dataloader, criterion, device): + """Evaluate the model.""" + model.eval() + total_loss = 0 + all_labels = [] + all_predictions = [] + all_probabilities = [] + + with torch.no_grad(): + for batch in dataloader: + signals = batch["signal"].to(device) + labels = batch["label"].to(device) + + # Forward pass + outputs = model(signals) + loss = criterion(outputs, labels) + + total_loss += loss.item() + + # Get predictions and probabilities + probabilities = torch.softmax(outputs, dim=1) + _, predicted = torch.max(outputs, 1) + + all_labels.extend(labels.cpu().numpy()) + all_predictions.extend(predicted.cpu().numpy()) + all_probabilities.extend(probabilities[:, 1].cpu().numpy()) # Probability of positive class + + avg_loss = total_loss / len(dataloader) + + # Calculate metrics + accuracy = accuracy_score(all_labels, all_predictions) + precision = precision_score(all_labels, all_predictions, zero_division=0) + recall = recall_score(all_labels, all_predictions, zero_division=0) + f1 = f1_score(all_labels, all_predictions, zero_division=0) + + # Calculate AUC if there are both positive and negative samples + if len(set(all_labels)) > 1: + auc = roc_auc_score(all_labels, all_probabilities) + else: + auc = 0.0 + + return avg_loss, accuracy, precision, recall, f1, auc + + +def main(): + """Main function to demonstrate COVID-RED dataset usage.""" + + # Set random seeds for reproducibility + torch.manual_seed(42) + np.random.seed(42) + + # Configuration + DATA_ROOT = "/path/to/covidred" # Update this path + WINDOW_DAYS = 7 + TASK_TYPE = "prediction" # "detection" or 
"prediction" + BATCH_SIZE = 32 + NUM_EPOCHS = 50 + LEARNING_RATE = 0.001 + DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print("=" * 80) + print("COVID-RED Dataset Example for PyHealth") + print("=" * 80) + + # Load datasets + print("\n[1/5] Loading COVID-RED dataset...") + print(f" - Root directory: {DATA_ROOT}") + print(f" - Window size: {WINDOW_DAYS} days") + print(f" - Task type: {TASK_TYPE}") + + try: + train_dataset = COVIDREDDataset( + root=DATA_ROOT, + split="train", + window_days=WINDOW_DAYS, + task=TASK_TYPE, + ) + + test_dataset = COVIDREDDataset( + root=DATA_ROOT, + split="test", + window_days=WINDOW_DAYS, + task=TASK_TYPE, + ) + + print(f"\n Dataset loaded successfully!") + print(f" - Training samples: {len(train_dataset)}") + print(f" - Test samples: {len(test_dataset)}") + + # Show label distribution + train_dist = train_dataset.get_label_distribution() + test_dist = test_dataset.get_label_distribution() + + print(f"\n Training set distribution:") + print(f" - Positive: {train_dist['positive_samples']} ({train_dist['positive_ratio']:.2%})") + print(f" - Negative: {train_dist['negative_samples']}") + + print(f"\n Test set distribution:") + print(f" - Positive: {test_dist['positive_samples']} ({test_dist['positive_ratio']:.2%})") + print(f" - Negative: {test_dist['negative_samples']}") + + except FileNotFoundError as e: + print(f"\n ERROR: {e}") + print("\n Please download the COVID-RED dataset from:") + print(" https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7") + return + + # Create data loaders + print("\n[2/5] Creating data loaders...") + + # Apply task function to samples + task_fn = covidred_prediction_fn if TASK_TYPE == "prediction" else covidred_detection_fn + + # Wrap samples with task function + class TaskDataset(torch.utils.data.Dataset): + def __init__(self, base_dataset, task_fn): + self.base_dataset = base_dataset + self.task_fn = task_fn + + def __len__(self): + return len(self.base_dataset) + + def __getitem__(self, idx): + sample = self.base_dataset[idx] + return self.task_fn(sample) + + train_task_dataset = TaskDataset(train_dataset, task_fn) + test_task_dataset = TaskDataset(test_dataset, task_fn) + + train_loader = DataLoader( + train_task_dataset, + batch_size=BATCH_SIZE, + shuffle=True, + collate_fn=collate_fn, + ) + + test_loader = DataLoader( + test_task_dataset, + batch_size=BATCH_SIZE, + shuffle=False, + collate_fn=collate_fn, + ) + + print(f" - Batch size: {BATCH_SIZE}") + print(f" - Training batches: {len(train_loader)}") + print(f" - Test batches: {len(test_loader)}") + + # Initialize model + print("\n[3/5] Initializing LSTM model...") + + # Get feature dimension from first sample + sample = train_dataset[0] + input_size = len(train_dataset.get_feature_names()) + + model = LSTMClassifier( + input_size=input_size, + hidden_size=64, + num_layers=2, + num_classes=2, + dropout=0.3, + ).to(DEVICE) + + print(f" - Input features: {input_size}") + print(f" - Model parameters: {sum(p.numel() for p in model.parameters()):,}") + print(f" - Device: {DEVICE}") + + # Loss and optimizer + # Use weighted loss for imbalanced datasets + pos_weight = train_dist['negative_samples'] / max(train_dist['positive_samples'], 1) + criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, pos_weight]).to(DEVICE)) + optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) + + # Training + print("\n[4/5] Training model...") + print(f" - Epochs: {NUM_EPOCHS}") + print(f" - Learning rate: {LEARNING_RATE}") + print(f" - Class 
weight (positive): {pos_weight:.2f}")
+
+    best_f1 = 0.0
+
+    for epoch in range(NUM_EPOCHS):
+        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
+        test_loss, test_acc, test_prec, test_rec, test_f1, test_auc = evaluate(
+            model, test_loader, criterion, DEVICE
+        )
+
+        # Save best model
+        if test_f1 > best_f1:
+            best_f1 = test_f1
+            torch.save(model.state_dict(), "best_covidred_model.pt")
+
+        if (epoch + 1) % 10 == 0:
+            print(f"\n  Epoch [{epoch+1}/{NUM_EPOCHS}]")
+            print(f"    Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
+            print(f"    Test  - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, "
+                  f"F1: {test_f1:.4f}, AUC: {test_auc:.4f}")
+
+    # Final evaluation
+    print("\n[5/5] Final evaluation on test set...")
+    model.load_state_dict(torch.load("best_covidred_model.pt"))
+    test_loss, test_acc, test_prec, test_rec, test_f1, test_auc = evaluate(
+        model, test_loader, criterion, DEVICE
+    )
+
+    print(f"\n  Final Test Metrics:")
+    print(f"    - Accuracy:  {test_acc:.4f}")
+    print(f"    - Precision: {test_prec:.4f}")
+    print(f"    - Recall:    {test_rec:.4f}")
+    print(f"    - F1-Score:  {test_f1:.4f}")
+    print(f"    - AUC:       {test_auc:.4f}")
+
+    print("\n" + "=" * 80)
+    print("Training complete! Best model saved to 'best_covidred_model.pt'")
+    print("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
index 7d6a65f16..1b2090f46 100644
--- a/pyhealth/datasets/__init__.py
+++ b/pyhealth/datasets/__init__.py
@@ -67,6 +67,7 @@ def __init__(self, *args, **kwargs):
 from .bmd_hs import BMDHSDataset
 from .support2 import Support2Dataset
 from .tcga_prad import TCGAPRADDataset
+from .covidred_dataset import COVIDREDDataset
 from .splitter import (
     split_by_patient,
     split_by_patient_conformal,
diff --git a/pyhealth/datasets/covidred_dataset.py b/pyhealth/datasets/covidred_dataset.py
new file mode 100644
index 000000000..be85486d1
--- /dev/null
+++ b/pyhealth/datasets/covidred_dataset.py
@@ -0,0 +1,312 @@
+"""
+COVID-RED Dataset Loader for PyHealth
+
+This module implements a dataset loader for the COVID-RED (Remote Early Detection
+of SARS-CoV-2 infections) dataset from Utrecht University.
+
+Dataset: https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7
+"""
+
+import os
+import pandas as pd
+import torch
+from torch.utils.data import Dataset
+from typing import Optional, Callable, Literal
+
+
+class COVIDREDDataset(Dataset):
+    """
+    COVID-RED Dataset for early detection of COVID-19 from wearable device data.
+
+    The COVID-RED dataset contains wearable device measurements (heart rate, steps, sleep)
+    from participants during the COVID-19 pandemic, collected to enable early detection
+    of SARS-CoV-2 infections before symptom onset.
+
+    Parameters
+    ----------
+    root : str
+        Root directory containing the COVID-RED dataset files.
+        Expected files:
+        - heart_rate.csv (daily heart rate measurements)
+        - steps.csv (daily step counts)
+        - sleep.csv (daily sleep duration)
+        - labels.csv (COVID-19 test results and symptom dates)
+
+    split : Literal["train", "test", "all"], default="train"
+        Which split of the data to use.
+        - "train": Training set (70% of participants)
+        - "test": Test set (30% of participants)
+        - "all": All data
+
+    window_days : int, default=7
+        Number of days to include in each sample window. 
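Windows slide one day at
+        a time, so ``window_days=7`` turns N consecutive daily records into
+        N - 6 samples.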
+ + task : Literal["detection", "prediction"], default="detection" + Task type: + - "detection": Classify COVID-19 positive vs negative during illness period + - "prediction": Predict COVID-19 onset before symptom onset (early detection) + + transform : Optional[Callable], default=None + Optional transform to be applied on a sample. + + random_seed : int, default=42 + Random seed for train/test split reproducibility. + + Attributes + ---------- + samples : list + List of sample dictionaries containing features and labels. + + feature_names : list + Names of features included in each sample. + + Examples + -------- + >>> from pyhealth.datasets import COVIDREDDataset + >>> dataset = COVIDREDDataset( + ... root="/path/to/covidred", + ... split="train", + ... window_days=7, + ... task="prediction" + ... ) + >>> print(f"Dataset size: {len(dataset)}") + >>> sample = dataset[0] + >>> print(f"Features shape: {sample['features'].shape}") + >>> print(f"Label: {sample['label']}") + + Notes + ----- + The dataset must be manually downloaded from: + https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7 + + Citation: + Olthof, A.W., Schut, A., van Beijnum, B.F. et al. (2021). + Remote Early Detection of SARS-CoV-2 infections (COVID-RED). + DataverseNL. https://doi.org/10.34894/FW9PO7 + """ + + def __init__( + self, + root: str, + split: Literal["train", "test", "all"] = "train", + window_days: int = 7, + task: Literal["detection", "prediction"] = "detection", + transform: Optional[Callable] = None, + random_seed: int = 42, + ): + self.root = root + self.split = split + self.window_days = window_days + self.task = task + self.transform = transform + self.random_seed = random_seed + + # Feature names for each data type + self.feature_names = [ + "resting_hr_mean", + "resting_hr_std", + "resting_hr_min", + "resting_hr_max", + "steps_total", + "steps_mean_hourly", + "sleep_duration_hours", + "sleep_efficiency", + ] + + # Load and process the dataset + self._load_data() + self._create_samples() + + def _load_data(self): + """Load CSV files from the dataset directory.""" + # Check if required files exist + required_files = ["heart_rate.csv", "steps.csv", "sleep.csv", "labels.csv"] + for file in required_files: + file_path = os.path.join(self.root, file) + if not os.path.exists(file_path): + raise FileNotFoundError( + f"Required file '{file}' not found in {self.root}. 
" + f"Please download the COVID-RED dataset from: " + f"https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7" + ) + + # Load data files + self.heart_rate_df = pd.read_csv(os.path.join(self.root, "heart_rate.csv")) + self.steps_df = pd.read_csv(os.path.join(self.root, "steps.csv")) + self.sleep_df = pd.read_csv(os.path.join(self.root, "sleep.csv")) + self.labels_df = pd.read_csv(os.path.join(self.root, "labels.csv")) + + # Convert date columns to datetime + for df in [self.heart_rate_df, self.steps_df, self.sleep_df, self.labels_df]: + if "date" in df.columns: + df["date"] = pd.to_datetime(df["date"]) + + def _create_samples(self): + """Create samples with sliding windows.""" + self.samples = [] + + # Get unique participants + participants = self.labels_df["participant_id"].unique() + + # Split participants into train/test + import numpy as np + np.random.seed(self.random_seed) + n_train = int(len(participants) * 0.7) + shuffled_participants = np.random.permutation(participants) + train_participants = shuffled_participants[:n_train] + test_participants = shuffled_participants[n_train:] + + # Select participants based on split + if self.split == "train": + selected_participants = train_participants + elif self.split == "test": + selected_participants = test_participants + else: # "all" + selected_participants = participants + + # Create samples for each participant + for participant_id in selected_participants: + self._create_participant_samples(participant_id) + + def _create_participant_samples(self, participant_id: int): + """Create samples for a single participant.""" + # Get participant data + hr_data = self.heart_rate_df[ + self.heart_rate_df["participant_id"] == participant_id + ].sort_values("date") + + steps_data = self.steps_df[ + self.steps_df["participant_id"] == participant_id + ].sort_values("date") + + sleep_data = self.sleep_df[ + self.sleep_df["participant_id"] == participant_id + ].sort_values("date") + + label_info = self.labels_df[ + self.labels_df["participant_id"] == participant_id + ].iloc[0] + + # Merge data on date + merged = hr_data.merge( + steps_data, on=["participant_id", "date"], how="outer" + ).merge( + sleep_data, on=["participant_id", "date"], how="outer" + ).sort_values("date") + + # Fill missing values with forward fill then backward fill + merged = merged.fillna(method="ffill").fillna(method="bfill") + + # Create sliding windows + for i in range(len(merged) - self.window_days + 1): + window_data = merged.iloc[i:i + self.window_days] + + # Determine label based on task type + if self.task == "detection": + # COVID-19 positive (1) or negative (0) during illness period + label = int(label_info["covid_positive"]) + else: # "prediction" + # Early detection: predict COVID-19 onset + # Check if window is before symptom onset + if pd.notna(label_info.get("symptom_onset_date")): + symptom_date = pd.to_datetime(label_info["symptom_onset_date"]) + window_end = window_data["date"].iloc[-1] + # Label as 1 if participant will develop COVID-19 + # and window is before symptom onset + if label_info["covid_positive"] == 1: + days_to_onset = (symptom_date - window_end).days + # Pre-symptomatic period (1-14 days before onset) + label = int(0 < days_to_onset <= 14) + else: + label = 0 + else: + label = 0 + + # Extract features + features = self._extract_features(window_data) + + # Create sample + sample = { + "participant_id": participant_id, + "window_start_date": window_data["date"].iloc[0], + "window_end_date": window_data["date"].iloc[-1], + "features": 
features,
+                "label": label,
+            }
+
+            self.samples.append(sample)
+
+    def _extract_features(self, window_data: pd.DataFrame) -> torch.Tensor:
+        """
+        Extract features from a window of data.
+
+        Parameters
+        ----------
+        window_data : pd.DataFrame
+            DataFrame containing window_days rows of measurements.
+
+        Returns
+        -------
+        torch.Tensor
+            Feature tensor of shape (window_days, n_features).
+        """
+        features = []
+
+        for _, row in window_data.iterrows():
+            day_features = [
+                row.get("resting_hr_mean", 0.0),
+                row.get("resting_hr_std", 0.0),
+                row.get("resting_hr_min", 0.0),
+                row.get("resting_hr_max", 0.0),
+                row.get("steps_total", 0.0),
+                row.get("steps_mean_hourly", 0.0),
+                row.get("sleep_duration_hours", 0.0),
+                row.get("sleep_efficiency", 0.0),
+            ]
+            features.append(day_features)
+
+        return torch.tensor(features, dtype=torch.float32)
+
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, idx: int) -> dict:
+        """
+        Get a sample from the dataset.
+
+        Parameters
+        ----------
+        idx : int
+            Index of the sample.
+
+        Returns
+        -------
+        dict
+            Sample dictionary containing:
+            - participant_id: Participant identifier
+            - window_start_date: Start date of the window
+            - window_end_date: End date of the window
+            - features: Feature tensor of shape (window_days, n_features)
+            - label: Binary label (0 or 1)
+        """
+        sample = self.samples[idx].copy()
+
+        if self.transform:
+            sample = self.transform(sample)
+
+        return sample
+
+    def get_feature_names(self) -> list:
+        """Return the list of feature names."""
+        return self.feature_names
+
+    def get_label_distribution(self) -> dict:
+        """Return the distribution of labels in the dataset."""
+        labels = [sample["label"] for sample in self.samples]
+        return {
+            "total_samples": len(labels),
+            "positive_samples": sum(labels),
+            "negative_samples": len(labels) - sum(labels),
+            "positive_ratio": sum(labels) / len(labels) if labels else 0.0,
+        }
diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py
index fb3c6966a..01b0ebb37 100644
--- a/pyhealth/tasks/__init__.py
+++ b/pyhealth/tasks/__init__.py
@@ -62,3 +62,8 @@
     MutationPathogenicityPrediction,
     VariantClassificationClinVar,
 )
+from .covidred_tasks import (
+    covidred_detection_fn,
+    covidred_prediction_fn,
+    covidred_multiclass_fn,
+)
diff --git a/pyhealth/tasks/covidred_tasks.py b/pyhealth/tasks/covidred_tasks.py
new file mode 100644
index 000000000..09a6adb9c
--- /dev/null
+++ b/pyhealth/tasks/covidred_tasks.py
@@ -0,0 +1,201 @@
+"""
+COVID-RED Classification Task Functions for PyHealth
+
+This module provides task functions for COVID-19 detection and prediction
+using the COVID-RED wearable device dataset.
+"""
+
+from typing import Dict, Any
+import torch
+
+
+def covidred_detection_fn(sample: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Task function for COVID-19 detection from wearable device data.
+
+    This function processes a sample from the COVID-RED dataset and formats it
+    for PyHealth's standard pipeline. The task is to classify whether a participant
+    has COVID-19 based on their wearable device measurements (heart rate, steps, sleep). 
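Each sample covers a single
+    fixed-length window of daily measurements, and the label applies to the
+    window as a whole.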
+ + Parameters + ---------- + sample : Dict[str, Any] + A sample dictionary from COVIDREDDataset containing: + - participant_id: Participant identifier + - window_start_date: Start date of the measurement window + - window_end_date: End date of the measurement window + - features: Feature tensor of shape (window_days, n_features) + - label: Binary label (1=COVID-19 positive, 0=negative) + + Returns + ------- + Dict[str, Any] + Processed sample dictionary in PyHealth format: + - patient_id: Participant identifier (str) + - visit_id: Unique identifier for this window (str) + - signal: Time series tensor of shape (n_features, window_days) + - label: Binary classification label (int, 0 or 1) + - metadata: Additional information (dict) + + Examples + -------- + >>> from pyhealth.datasets import COVIDREDDataset + >>> from pyhealth.tasks import covidred_detection_fn + >>> + >>> dataset = COVIDREDDataset(root="/path/to/covidred", split="train") + >>> sample = dataset[0] + >>> processed_sample = covidred_detection_fn(sample) + >>> print(processed_sample.keys()) + dict_keys(['patient_id', 'visit_id', 'signal', 'label', 'metadata']) + + Notes + ----- + The signal tensor is transposed to shape (n_features, window_days) to match + PyHealth's expected format for time series data, where the first dimension + represents different feature channels and the second represents time steps. + """ + # Extract patient and visit identifiers + patient_id = str(sample["participant_id"]) + visit_id = f"{patient_id}_{sample['window_start_date'].strftime('%Y%m%d')}" + + # Transpose features from (window_days, n_features) to (n_features, window_days) + # This matches PyHealth's expected signal format + signal = sample["features"].transpose(0, 1) + + # Extract label + label = int(sample["label"]) + + # Create metadata + metadata = { + "window_start_date": sample["window_start_date"], + "window_end_date": sample["window_end_date"], + "window_days": signal.shape[1], + "n_features": signal.shape[0], + } + + return { + "patient_id": patient_id, + "visit_id": visit_id, + "signal": signal, + "label": label, + "metadata": metadata, + } + + +def covidred_prediction_fn(sample: Dict[str, Any]) -> Dict[str, Any]: + """ + Task function for early COVID-19 prediction from wearable device data. + + This function processes a sample from the COVID-RED dataset for the early + detection task - predicting COVID-19 onset before symptom appearance. + + Parameters + ---------- + sample : Dict[str, Any] + A sample dictionary from COVIDREDDataset containing: + - participant_id: Participant identifier + - window_start_date: Start date of the measurement window + - window_end_date: End date of the measurement window + - features: Feature tensor of shape (window_days, n_features) + - label: Binary label (1=pre-symptomatic period, 0=normal) + + Returns + ------- + Dict[str, Any] + Processed sample dictionary in PyHealth format: + - patient_id: Participant identifier (str) + - visit_id: Unique identifier for this window (str) + - signal: Time series tensor of shape (n_features, window_days) + - label: Binary prediction label (int, 0 or 1) + - metadata: Additional information (dict) + + Examples + -------- + >>> from pyhealth.datasets import COVIDREDDataset + >>> from pyhealth.tasks import covidred_prediction_fn + >>> + >>> dataset = COVIDREDDataset( + ... root="/path/to/covidred", + ... split="train", + ... task="prediction" + ... 
)
+    >>> sample = dataset[0]
+    >>> processed_sample = covidred_prediction_fn(sample)
+    >>> print(f"Signal shape: {processed_sample['signal'].shape}")
+    >>> print(f"Label: {processed_sample['label']}")
+
+    Notes
+    -----
+    The prediction task focuses on identifying pre-symptomatic patterns in the
+    1-14 days before symptom onset, which is critical for early intervention
+    and reducing transmission.
+    """
+    # Use the same processing as detection task
+    # The distinction is in how the dataset creates labels
+    return covidred_detection_fn(sample)
+
+
+def covidred_multiclass_fn(sample: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Task function for multiclass COVID-19 severity classification.
+
+    This function extends the basic detection to classify COVID-19 cases
+    into multiple severity categories: negative, mild, moderate, severe.
+
+    Parameters
+    ----------
+    sample : Dict[str, Any]
+        A sample dictionary from COVIDREDDataset with additional severity info.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Processed sample dictionary with multiclass label:
+        - 0: COVID-19 negative
+        - 1: Mild (recovered at home, no assistance)
+        - 2: Moderate (recovered at home with assistance)
+        - 3: Severe (hospitalized)
+
+    Examples
+    --------
+    >>> from pyhealth.datasets import COVIDREDDataset
+    >>> from pyhealth.tasks import covidred_multiclass_fn
+    >>>
+    >>> # Assuming dataset includes severity information
+    >>> dataset = COVIDREDDataset(root="/path/to/covidred", split="train")
+    >>> sample = dataset[0]
+    >>> processed_sample = covidred_multiclass_fn(sample)
+    >>> print(f"Severity class: {processed_sample['label']}")
+
+    Notes
+    -----
+    This task requires the dataset to include severity information.
+    Check dataset documentation for availability.
+    """
+    # Extract patient and visit identifiers
+    patient_id = str(sample["participant_id"])
+    visit_id = f"{patient_id}_{sample['window_start_date'].strftime('%Y%m%d')}"
+
+    # Transpose features
+    signal = sample["features"].transpose(0, 1)
+
+    # Extract severity label if available
+    # Default to binary if severity not provided
+    label = sample.get("severity_label", sample["label"])
+
+    # Create metadata
+    metadata = {
+        "window_start_date": sample["window_start_date"],
+        "window_end_date": sample["window_end_date"],
+        "window_days": signal.shape[1],
+        "n_features": signal.shape[0],
+        "task_type": "multiclass_severity",
+    }
+
+    return {
+        "patient_id": patient_id,
+        "visit_id": visit_id,
+        "signal": signal,
+        "label": int(label),
+        "metadata": metadata,
+    }
From 6022e425d839ef924a8b6c2de1a1e083a27cb407 Mon Sep 17 00:00:00 2001
From: Jennifer Polsgrove
Date: Sun, 7 Dec 2025 21:05:27 -0600
Subject: [PATCH 3/3] corrected dataset

---
 pyhealth/datasets/covidred_dataset.py | 397 +++++++++++++++-----------
 1 file changed, 228 insertions(+), 169 deletions(-)

diff --git a/pyhealth/datasets/covidred_dataset.py b/pyhealth/datasets/covidred_dataset.py
index be85486d1..ad0e5f9ec 100644
--- a/pyhealth/datasets/covidred_dataset.py
+++ b/pyhealth/datasets/covidred_dataset.py
@@ -26,25 +26,34 @@ class COVIDREDDataset(Dataset):
     ----------
     root : str
         Root directory containing the COVID-RED dataset files.
-        Expected files:
-        - heart_rate.csv (daily heart rate measurements)
-        - steps.csv (daily step counts)
-        - sleep.csv (daily sleep duration)
-        - labels.csv (COVID-19 test results and symptom dates)
+        Expected files (from DataverseNL download):
+        - bc_20230515.csv (baseline characteristics)
+        - ct_20230515.csv (COVID-19 test results)
+        - cv_20230515.csv (COVID-19 vaccination)
+        - dm_20230515.csv (daily measurements - heart rate, steps, etc.) 
+ - field_options.csv (field value mappings) + - ho_20230515.csv (hospitalization) + - hu_20230515.csv (healthcare utilization) + - ie_20230515.csv (illness episodes) + - mh_20230515.csv (medical history) + - ov_20230515.csv (overview/participant info) + - pcr_20230515.csv (PCR test results) + - sc_20230515.csv (symptom checklist) + - ser_20230515.csv (serology results) + - si_20230515.csv (symptom information) + - variable_descriptions.csv (data dictionary) + - wd_20230515.csv (wearable device data) split : Literal["train", "test", "all"], default="train" Which split of the data to use. - - "train": Training set (70% of participants) - - "test": Test set (30% of participants) - - "all": All data window_days : int, default=7 Number of days to include in each sample window. task : Literal["detection", "prediction"], default="detection" Task type: - - "detection": Classify COVID-19 positive vs negative during illness period - - "prediction": Predict COVID-19 onset before symptom onset (early detection) + - "detection": Classify COVID-19 positive vs negative + - "prediction": Predict COVID-19 onset before symptom onset transform : Optional[Callable], default=None Optional transform to be applied on a sample. @@ -52,14 +61,6 @@ class COVIDREDDataset(Dataset): random_seed : int, default=42 Random seed for train/test split reproducibility. - Attributes - ---------- - samples : list - List of sample dictionaries containing features and labels. - - feature_names : list - Names of features included in each sample. - Examples -------- >>> from pyhealth.datasets import COVIDREDDataset @@ -72,17 +73,10 @@ class COVIDREDDataset(Dataset): >>> print(f"Dataset size: {len(dataset)}") >>> sample = dataset[0] >>> print(f"Features shape: {sample['features'].shape}") - >>> print(f"Label: {sample['label']}") Notes ----- - The dataset must be manually downloaded from: - https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7 - - Citation: - Olthof, A.W., Schut, A., van Beijnum, B.F. et al. (2021). - Remote Early Detection of SARS-CoV-2 infections (COVID-RED). - DataverseNL. https://doi.org/10.34894/FW9PO7 + Download from: https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7 """ def __init__( @@ -101,10 +95,10 @@ def __init__( self.transform = transform self.random_seed = random_seed - # Feature names for each data type + # Feature names self.feature_names = [ "resting_hr_mean", - "resting_hr_std", + "resting_hr_std", "resting_hr_min", "resting_hr_max", "steps_total", @@ -118,178 +112,243 @@ def __init__( self._create_samples() def _load_data(self): - """Load CSV files from the dataset directory.""" + """Load CSV files from the COVID-RED dataset directory.""" # Check if required files exist - required_files = ["heart_rate.csv", "steps.csv", "sleep.csv", "labels.csv"] - for file in required_files: - file_path = os.path.join(self.root, file) + required_files = { + 'daily_measurements': 'dm_20230515.csv', + 'wearable_data': 'wd_20230515.csv', + 'covid_tests': 'ct_20230515.csv', + 'symptom_info': 'si_20230515.csv', + 'illness_episodes': 'ie_20230515.csv', + 'overview': 'ov_20230515.csv', + } + + missing_files = [] + for name, filename in required_files.items(): + file_path = os.path.join(self.root, filename) if not os.path.exists(file_path): - raise FileNotFoundError( - f"Required file '{file}' not found in {self.root}. 
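# record the absent file; all missing names are raised together below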
" - f"Please download the COVID-RED dataset from: " - f"https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7" - ) - - # Load data files - self.heart_rate_df = pd.read_csv(os.path.join(self.root, "heart_rate.csv")) - self.steps_df = pd.read_csv(os.path.join(self.root, "steps.csv")) - self.sleep_df = pd.read_csv(os.path.join(self.root, "sleep.csv")) - self.labels_df = pd.read_csv(os.path.join(self.root, "labels.csv")) - - # Convert date columns to datetime - for df in [self.heart_rate_df, self.steps_df, self.sleep_df, self.labels_df]: - if "date" in df.columns: - df["date"] = pd.to_datetime(df["date"]) + missing_files.append(filename) + + if missing_files: + raise FileNotFoundError( + f"Required files not found in {self.root}:\n" + f"{', '.join(missing_files)}\n\n" + f"Please download the COVID-RED dataset from:\n" + f"https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/FW9PO7\n\n" + f"Expected files:\n" + + "\n".join(f" - {f}" for f in required_files.values()) + ) + + # Load main data files + print(f"Loading COVID-RED dataset from {self.root}...") + + self.daily_measurements = pd.read_csv(os.path.join(self.root, 'dm_20230515.csv')) + self.wearable_data = pd.read_csv(os.path.join(self.root, 'wd_20230515.csv')) + self.covid_tests = pd.read_csv(os.path.join(self.root, 'ct_20230515.csv')) + self.symptom_info = pd.read_csv(os.path.join(self.root, 'si_20230515.csv')) + self.illness_episodes = pd.read_csv(os.path.join(self.root, 'ie_20230515.csv')) + self.overview = pd.read_csv(os.path.join(self.root, 'ov_20230515.csv')) + + print(f"✓ Loaded {len(self.overview)} participants") + print(f"✓ Daily measurements: {len(self.daily_measurements)} records") + print(f"✓ Wearable data: {len(self.wearable_data)} records") + # Convert date columns + self._convert_dates() + + def _convert_dates(self): + """Convert date columns to datetime format.""" + date_columns_map = { + 'daily_measurements': ['date', 'measurement_date'], + 'wearable_data': ['date', 'wear_date'], + 'covid_tests': ['test_date', 'result_date'], + 'symptom_info': ['symptom_date', 'onset_date'], + 'illness_episodes': ['start_date', 'end_date'], + } + + for df_name, possible_cols in date_columns_map.items(): + df = getattr(self, df_name) + for col in possible_cols: + if col in df.columns: + try: + df[col] = pd.to_datetime(df[col], errors='coerce') + except: + pass + def _create_samples(self): """Create samples with sliding windows.""" self.samples = [] # Get unique participants - participants = self.labels_df["participant_id"].unique() + id_col = self._find_id_column(self.overview) + participants = self.overview[id_col].unique() - # Split participants into train/test + # Split participants import numpy as np np.random.seed(self.random_seed) n_train = int(len(participants) * 0.7) - shuffled_participants = np.random.permutation(participants) - train_participants = shuffled_participants[:n_train] - test_participants = shuffled_participants[n_train:] + shuffled = np.random.permutation(participants) - # Select participants based on split if self.split == "train": - selected_participants = train_participants + selected = shuffled[:n_train] elif self.split == "test": - selected_participants = test_participants - else: # "all" - selected_participants = participants + selected = shuffled[n_train:] + else: + selected = participants + + print(f"\nCreating samples for {len(selected)} participants...") - # Create samples for each participant - for participant_id in selected_participants: + for participant_id in selected: 
self._create_participant_samples(participant_id) + + print(f"✓ Created {len(self.samples)} samples") + + def _find_id_column(self, df): + """Find the participant ID column in a dataframe.""" + for col in ['participant_id', 'subject_id', 'id', 'user_id']: + if col in df.columns: + return col + return df.columns[0] - def _create_participant_samples(self, participant_id: int): + def _create_participant_samples(self, participant_id): """Create samples for a single participant.""" + id_col = self._find_id_column(self.daily_measurements) + # Get participant data - hr_data = self.heart_rate_df[ - self.heart_rate_df["participant_id"] == participant_id - ].sort_values("date") - - steps_data = self.steps_df[ - self.steps_df["participant_id"] == participant_id - ].sort_values("date") - - sleep_data = self.sleep_df[ - self.sleep_df["participant_id"] == participant_id - ].sort_values("date") - - label_info = self.labels_df[ - self.labels_df["participant_id"] == participant_id - ].iloc[0] - - # Merge data on date - merged = hr_data.merge( - steps_data, on=["participant_id", "date"], how="outer" - ).merge( - sleep_data, on=["participant_id", "date"], how="outer" - ).sort_values("date") - - # Fill missing values with forward fill then backward fill - merged = merged.fillna(method="ffill").fillna(method="bfill") - - # Create sliding windows - for i in range(len(merged) - self.window_days + 1): - window_data = merged.iloc[i:i + self.window_days] + data = self.daily_measurements[ + self.daily_measurements[id_col] == participant_id + ].copy() + + if len(data) == 0: + return + + # Find date column + date_col = None + for col in ['date', 'measurement_date', 'day', 'record_date']: + if col in data.columns: + date_col = col + break + + if not date_col: + return + + data = data.sort_values(date_col) + + # Get COVID label + covid_positive, symptom_date = self._get_covid_label(participant_id) + + # Create windows + for i in range(len(data) - self.window_days + 1): + window = data.iloc[i:i + self.window_days] - # Determine label based on task type + window_start = window[date_col].iloc[0] + window_end = window[date_col].iloc[-1] + + # Determine label if self.task == "detection": - # COVID-19 positive (1) or negative (0) during illness period - label = int(label_info["covid_positive"]) - else: # "prediction" - # Early detection: predict COVID-19 onset - # Check if window is before symptom onset - if pd.notna(label_info.get("symptom_onset_date")): - symptom_date = pd.to_datetime(label_info["symptom_onset_date"]) - window_end = window_data["date"].iloc[-1] - # Label as 1 if participant will develop COVID-19 - # and window is before symptom onset - if label_info["covid_positive"] == 1: + label = covid_positive + else: # prediction + label = 0 + if covid_positive == 1 and symptom_date is not None: + if pd.notna(symptom_date) and pd.notna(window_end): days_to_onset = (symptom_date - window_end).days - # Pre-symptomatic period (1-14 days before onset) label = int(0 < days_to_onset <= 14) - else: - label = 0 - else: - label = 0 # Extract features - features = self._extract_features(window_data) - - # Create sample - sample = { - "participant_id": participant_id, - "window_start_date": window_data["date"].iloc[0], - "window_end_date": window_data["date"].iloc[-1], - "features": features, - "label": label, - } + features = self._extract_features(window) - self.samples.append(sample) + if features is not None: + self.samples.append({ + "participant_id": participant_id, + "window_start_date": window_start, + "window_end_date": 
window_end, + "features": features, + "label": label, + }) + + def _get_covid_label(self, participant_id): + """Get COVID-19 label for a participant.""" + id_col = self._find_id_column(self.covid_tests) + + tests = self.covid_tests[self.covid_tests[id_col] == participant_id] + + # Check for positive result + covid_positive = 0 + for col in ['test_result', 'result', 'pcr_result', 'outcome', 'positive']: + if col in tests.columns and len(tests) > 0: + results = tests[col].astype(str).str.lower() + if any(r in ['positive', '1', 'true', 'pos'] for r in results): + covid_positive = 1 + break + + # Get symptom onset + symptom_date = None + id_col_symptom = self._find_id_column(self.symptom_info) + symptoms = self.symptom_info[self.symptom_info[id_col_symptom] == participant_id] + + if len(symptoms) > 0: + for col in ['onset_date', 'symptom_date', 'start_date']: + if col in symptoms.columns: + dates = symptoms[col].dropna() + if len(dates) > 0: + symptom_date = pd.to_datetime(dates.iloc[0]) + break + + return covid_positive, symptom_date - def _extract_features(self, window_data: pd.DataFrame) -> torch.Tensor: - """ - Extract features from a window of data. - - Parameters - ---------- - window_data : pd.DataFrame - DataFrame containing window_days rows of measurements. - - Returns - ------- - torch.Tensor - Feature tensor of shape (window_days, n_features). - """ + def _extract_features(self, window_data): + """Extract features from a window.""" + feature_mapping = { + 'resting_hr_mean': ['hr_mean', 'heart_rate_mean', 'resting_hr', 'hr_avg'], + 'resting_hr_std': ['hr_std', 'heart_rate_std', 'hr_sd'], + 'resting_hr_min': ['hr_min', 'heart_rate_min'], + 'resting_hr_max': ['hr_max', 'heart_rate_max'], + 'steps_total': ['steps', 'step_count', 'daily_steps', 'total_steps'], + 'steps_mean_hourly': ['steps_per_hour', 'hourly_steps'], + 'sleep_duration_hours': ['sleep_hours', 'sleep_duration', 'total_sleep'], + 'sleep_efficiency': ['sleep_eff', 'sleep_quality'], + } + features = [] for _, row in window_data.iterrows(): - day_features = [ - row.get("resting_hr_mean", 0.0), - row.get("resting_hr_std", 0.0), - row.get("resting_hr_min", 0.0), - row.get("resting_hr_max", 0.0), - row.get("steps_total", 0.0), - row.get("steps_mean_hourly", 0.0), - row.get("sleep_duration_hours", 0.0), - row.get("sleep_efficiency", 0.0), - ] + day_features = [] + + for feature_name in self.feature_names: + value = 0.0 + possible_cols = feature_mapping.get(feature_name, [feature_name]) + + for col in possible_cols: + if col in row.index and pd.notna(row[col]): + value = float(row[col]) + break + + # Calculate derived features + if feature_name == 'steps_mean_hourly' and value == 0.0: + for col in feature_mapping['steps_total']: + if col in row.index and pd.notna(row[col]): + value = float(row[col]) / 24.0 + break + + day_features.append(value) + features.append(day_features) - return torch.tensor(features, dtype=torch.float32) + try: + tensor = torch.tensor(features, dtype=torch.float32) + if tensor.shape == (self.window_days, len(self.feature_names)): + return tensor + except: + pass + + return None - def __len__(self) -> int: - """Return the number of samples in the dataset.""" + def __len__(self): + """Return the number of samples.""" return len(self.samples) - def __getitem__(self, idx: int) -> dict: - """ - Get a sample from the dataset. - - Parameters - ---------- - idx : int - Index of the sample. 
- - Returns - ------- - dict - Sample dictionary containing: - - participant_id: Participant identifier - - window_start_date: Start date of the window - - window_end_date: End date of the window - - features: Feature tensor of shape (window_days, n_features) - - label: Binary label (0 or 1) - """ + def __getitem__(self, idx): + """Get a sample.""" sample = self.samples[idx].copy() if self.transform: @@ -297,13 +356,13 @@ def __getitem__(self, idx: int) -> dict: return sample - def get_feature_names(self) -> list: - """Return the list of feature names.""" + def get_feature_names(self): + """Return feature names.""" return self.feature_names - def get_label_distribution(self) -> dict: - """Return the distribution of labels in the dataset.""" - labels = [sample["label"] for sample in self.samples] + def get_label_distribution(self): + """Return label distribution.""" + labels = [s["label"] for s in self.samples] return { "total_samples": len(labels), "positive_samples": sum(labels),