From 3a2582c559026168c2c4d2dd0a4190fb2cf30195 Mon Sep 17 00:00:00 2001
From: William Yue
Date: Mon, 5 Jan 2026 14:21:30 -0800
Subject: [PATCH] fix

---
 src/opentau/datasets/grounding/pixmo.py | 30 ------------------------
 src/opentau/datasets/grounding/vsr.py   | 31 -------------------------
 2 files changed, 61 deletions(-)

diff --git a/src/opentau/datasets/grounding/pixmo.py b/src/opentau/datasets/grounding/pixmo.py
index 148ff9d..f1ec8b9 100644
--- a/src/opentau/datasets/grounding/pixmo.py
+++ b/src/opentau/datasets/grounding/pixmo.py
@@ -15,36 +15,6 @@
 This module provides the PIXMO (Pixel-level Manipulation) dataset implementation
 for training vision-language models on part localization and object grounding
 tasks.
-
-The dataset contains images with point annotations for object parts, enabling models
-to learn fine-grained spatial understanding.
-
-The dataset is loaded from HuggingFace (allenai/pixmo-points) and includes
-automatic retry logic for handling image download failures. Point coordinates
-are normalized to a 255x255 grid and formatted as JSON strings in the postfix.
-
-Classes:
-    PixmoDataset: Dataset class that loads and formats PIXMO data for part
-        localization tasks.
-
-Functions:
-    _pil_from_url: Download and decode an image from URL with retry logic.
-    _get_post_fix: Convert point coordinates to normalized grid format and
-        format as JSON string.
-    _img_to_normalized_tensor: Convert PIL Image to normalized torch tensor.
-
-Constants:
-    IMG_SIZE: Target image size (224x224).
-    POINT_GRID: Grid size for point normalization (255x255).
-    MAX_RETRIES: Maximum HTTP retry attempts.
-    HTTP_TIMEOUT: HTTP request timeout in seconds.
-
-Example:
-    Use PIXMO dataset in training::
-
-        >>> from opentau.configs.default import DatasetConfig
-        >>> cfg = DatasetConfig(grounding="pixmo")
-        >>> dataset = make_dataset(cfg, train_cfg)
 """
 
 import json
diff --git a/src/opentau/datasets/grounding/vsr.py b/src/opentau/datasets/grounding/vsr.py
index 25d5dce..2917918 100644
--- a/src/opentau/datasets/grounding/vsr.py
+++ b/src/opentau/datasets/grounding/vsr.py
@@ -17,37 +17,6 @@ models on visual spatial reasoning tasks.
 The dataset contains images with statements about spatial relationships, and
 models must determine whether each statement is true or false based on the
 image content.
-
-The dataset is loaded from HuggingFace (cambridgeltl/vsr_random) and includes
-automatic retry logic for handling image download failures. Statements are
-formatted as grounding tasks with true/false labels.
-
-Key Features:
-    * Spatial reasoning: Tests understanding of spatial relationships between
-      objects in images.
-    * Binary classification: Simple true/false format for clear learning signal.
-    * Robust loading: Automatic retry with random sampling for failed image
-      downloads.
-
-Classes:
-    VSRDataset: Dataset class that loads and formats VSR data for true/false
-        spatial reasoning tasks.
-
-Functions:
-    _pil_from_url: Download and decode an image from URL with retry logic.
-    _img_to_normalized_tensor: Convert PIL Image to normalized torch tensor
-        with channel-first format and [0, 1] normalization.
-
-Constants:
-    MAX_RETRIES: Maximum HTTP retry attempts.
-    HTTP_TIMEOUT: HTTP request timeout in seconds.
-
-Example:
-    Use VSR dataset in training::
-
-        >>> from opentau.configs.default import DatasetConfig
-        >>> cfg = DatasetConfig(grounding="vsr")
-        >>> dataset = make_dataset(cfg, train_cfg)
 """
 
 import logging
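
The docstrings removed above were the only prose describing pixmo.py's data formatting: pixel-space points rescaled onto a 255x255 grid and serialized as a JSON "postfix", and images resized to 224x224 and converted to channel-first float tensors in [0, 1]. The following is a minimal sketch of that documented behavior, not the opentau implementation; the helper names below are hypothetical stand-ins for the module's actual _get_post_fix and _img_to_normalized_tensor.

    import json

    import numpy as np
    import torch
    from PIL import Image


    def points_to_grid_json(points, img_w, img_h, grid=255):
        # Rescale pixel-space (x, y) points onto a grid x grid coordinate
        # system and serialize them as a JSON string, mirroring the 255x255
        # "postfix" format the removed docstring describes.
        scaled = [
            {"x": round(x / img_w * grid), "y": round(y / img_h * grid)}
            for x, y in points
        ]
        return json.dumps(scaled)


    def image_to_normalized_tensor(img: Image.Image, size=224):
        # Resize to size x size, then return a (3, H, W) float tensor with
        # values in [0, 1], matching the documented IMG_SIZE and normalization.
        img = img.convert("RGB").resize((size, size))
        arr = torch.from_numpy(np.asarray(img, dtype=np.uint8).copy())
        return arr.permute(2, 0, 1).float() / 255.0

Rounding onto a 0-255 grid keeps each coordinate within a single byte's range, which is consistent with the POINT_GRID constant named in the removed text.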