diff --git a/DESCRIPTION b/DESCRIPTION
index 47fc9ec7..5fe92f4c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -80,6 +80,7 @@ Collate:
     'dataset-places365.R'
     'dataset-plankton.R'
     'dataset-rf100-peixos.R'
+    'dataset-vggface2.R'
     'extension.R'
     'globals.R'
     'imagenet.R'
diff --git a/NAMESPACE b/NAMESPACE
index 90693b1b..7a027459 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -214,6 +214,7 @@ export(transform_rotate)
 export(transform_ten_crop)
 export(transform_to_tensor)
 export(transform_vflip)
+export(vggface2_dataset)
 export(vision_make_grid)
 export(whoi_plankton_dataset)
 export(whoi_small_coralnet_dataset)
diff --git a/NEWS.md b/NEWS.md
index ccd6202a..d199b4da 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,11 +5,12 @@
 * Added `lfw_people_dataset()` and `lfw_pairs_dataset()` for loading Labelled Faces in the Wild (LFW) datasets (@DerrickUnleashed, #203).
 * Added `places365_dataset()`for loading the Places365 dataset (@koshtiakanksha, #196).
 * Added `pascal_segmentation_dataset()`, and `pascal_detection_dataset()` for loading the Pascal Visual Object Classes datasets (@DerrickUnleashed, #209).
 * Added `whoi_plankton_dataset()`, `whoi_small_plankton_dataset()`, and `whoi_small_coral_dataset()` (@cregouby, #236).
 * Added `rf100_document_collection()`, `rf100_medical_collection()`, `rf100_biology_collection()`, `rf100_damage_collection()`, `rf100_infrared_collection()`, and `rf100_underwater_collection()` . Those are collection of datasets from RoboFlow 100 under the same thematic, for a total of 35 datasets (@koshtiakanksha, @cregouby, #239).
 * Added `rf100_peixos_segmentation_dataset()`. (@koshtiakanksha, @cregouby, #250).
+* Added `vggface2_dataset()` for loading the VGGFace2 dataset (@DerrickUnleashed, #238).
## New models diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R new file mode 100644 index 00000000..7019989b --- /dev/null +++ b/R/dataset-vggface2.R @@ -0,0 +1,185 @@ +#' VGGFace2 Dataset +#' +#' The VGGFace2 dataset is a large-scale face recognition dataset containing images +#' of celebrities from a wide range of ethnicities, professions, and ages. +#' Each identity has multiple images with large variations in pose, age, illumination, +#' ethnicity, and profession. +#' +#' @inheritParams oxfordiiitpet_dataset +#' @param root Character. Root directory where the dataset will be stored under `root/vggface2`. +#' +#' @return A torch dataset object `vggface2_dataset`: +#' - `x`: RGB image array. +#' - `y`: Integer label (1…N) for the identity. +#' +#' `ds$classes` is a named list mapping integer labels to a list with: +#' - `name`: Character name of the person. +#' - `gender`: "Male" or "Female". +#' +#' @examples +#' \dontrun{ +#' #Load the training set +#' ds <- vggface2_dataset(download = TRUE) +#' item <- ds[1] +#' item$x # image array RGB +#' item$y # integer label +#' ds$classes[item$y] # list(name=..., gender=...) +#' +#' #Load the test set +#' ds <- vggface2_dataset(download = TRUE, train = FALSE) +#' item <- ds[1] +#' item$x # image array RGB +#' item$y # integer label +#' ds$classes[item$y] # list(name=..., gender=...) 
+#' }
+#'
+#' @family classification_dataset
+#' @export
+vggface2_dataset <- torch::dataset(
+  name = "vggface2",
+  resources = data.frame(
+    split = c("train_images", "test_images", "train_list", "test_list", "identity"),
+    url = c(
+      "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz",
+      "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz",
+      "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt",
+      "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt",
+      "https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv"
+    ),
+    md5 = c(
+      "88813c6b15de58afc8fa75ea83361d7f",
+      "bb7a323824d1004e14e00c23974facd3",
+      "4cfbab4a839163f454d7ecef28b68669",
+      "d08b10f12bc9889509364ef56d73c621",
+      "d315386c7e8e166c4f60e27d9cc61acc"
+    )
+  ),
+  training_file = "train.rds",
+  test_file = "test.rds",
+
+  initialize = function(
+    root = tempdir(),
+    train = TRUE,
+    transform = NULL,
+    target_transform = NULL,
+    download = FALSE
+  ) {
+    self$root_path <- root
+    self$transform <- transform
+    self$target_transform <- target_transform
+    if (train) {
+      self$split <- "train"
+      self$archive_size <- "36 GB"
+    } else {
+      self$split <- "test"
+      self$archive_size <- "2 GB"
+    }
+
+    if (download) {
+      cli_inform("Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.")
+      self$download()
+    }
+
+    if (!self$check_exists()) {
+      cli_abort("Dataset not found. 
You can use `download = TRUE` to download it.")
+    }
+
+    if (train) {
+      data_file <- self$training_file
+    } else {
+      data_file <- self$test_file
+    }
+    data <- readRDS(file.path(self$processed_folder, data_file))
+
+    self$img_path <- data$img_path
+    self$labels <- data$labels
+    self$classes <- data$classes
+
+    cli_inform("{.cls {class(self)[[1]]}} dataset loaded with {self$.length()} images across {length(self$classes)} classes.")
+  },
+
+  download = function() {
+    if (self$check_exists()) {
+      return()
+    }
+
+    fs::dir_create(self$raw_folder)
+    fs::dir_create(self$processed_folder)
+
+    cli_inform("Downloading {.cls {class(self)[[1]]}}...")
+
+    for (i in seq_len(nrow(self$resources))) {
+      row <- self$resources[i, ]
+      archive <- download_and_cache(row$url, prefix = row$split)
+      if (tools::md5sum(archive) != row$md5) {
+        # cli_abort() glue-interpolates {archive}; runtime_error() would print the braces verbatim,
+        # and cli_abort() is what the rest of this dataset already uses for errors.
+        cli_abort("Corrupt file! Delete the file in {archive} and try again.")
+      }
+      if (tools::file_ext(row$url) == "gz") {
+        utils::untar(archive, exdir = self$raw_folder)
+      } else {
+        # Copy rather than move: moving evicts the file from the download cache,
+        # forcing a re-download (and breaking the md5 re-check) on any later run.
+        fs::file_copy(archive, self$raw_folder, overwrite = TRUE)
+      }
+    }
+
+    identity_file <- file.path(self$raw_folder, "identity_meta.csv")
+    identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE)
+    identity_df$Class_ID <- trimws(identity_df$Class_ID)
+    identity_df$Gender <- factor(identity_df$Gender, levels = c("f", "m"), labels = c("Female", "Male"))
+
+    for (split in c("train", "test")) {
+      if (split == "train") {
+        list_file <- file.path(self$raw_folder, "train_list.txt")
+      } else {
+        list_file <- file.path(self$raw_folder, "test_list.txt")
+      }
+
+      split_df <- read.delim(
+        list_file,
+        sep = "/",
+        col.names = c("Class_ID", "img_path"),
+        header = FALSE,
+        stringsAsFactors = FALSE
+      )
+
+      merged_df <- merge(split_df, identity_df, by = "Class_ID", all.x = TRUE)
+      merged_df$Label <- as.integer(factor(merged_df$Class_ID, levels = unique(merged_df$Class_ID)))
+
+      # Build the integer-label -> list(name, gender) mapping documented as `ds$classes`.
+      class_meta <- merged_df[!duplicated(merged_df$Label), ]
+      class_meta <- class_meta[order(class_meta$Label), ]
+      classes <- lapply(seq_len(nrow(class_meta)), function(i) {
+        list(name = class_meta$Name[i], gender = as.character(class_meta$Gender[i]))
+      })
+
+      # Save exactly the fields initialize() reads back (img_path / labels / classes).
+      # Saving the raw merged_df was a bug: it only has `Label` (not `labels`), has no
+      # `classes` element, and `img_path` held bare file names, so readRDS(...)$labels and
+      # $classes came back NULL and .getitem() could not locate images on disk.
+      # NOTE(review): assumes the archives extract to <raw_folder>/<split>/<Class_ID>/<file>
+      # -- verify against the actual tarball layout.
+      saveRDS(
+        list(
+          img_path = file.path(self$raw_folder, split, merged_df$Class_ID, merged_df$img_path),
+          labels = merged_df$Label,
+          classes = classes
+        ),
+        file.path(self$processed_folder, paste0(split, ".rds"))
+      )
+    }
+
cli_inform("Dataset {.cls {class(self)[[1]]}} downloaded and extracted successfully.") + }, + + check_exists = function() { + fs::file_exists(file.path(self$processed_folder, self$training_file)) && + fs::file_exists(file.path(self$processed_folder, self$test_file)) + }, + + .getitem = function(index) { + x <- jpeg::readJPEG(self$img_path[index]) + y <- self$labels[index] + + if (!is.null(self$transform)) { + x <- self$transform(x) + } + if (!is.null(self$target_transform)) { + y <- self$target_transform(y) + } + list(x = x, y = y) + }, + + .length = function() { + length(self$img_path) + }, + + active = list( + raw_folder = function() { + file.path(self$root_path, "vggface2", "raw") + }, + processed_folder = function() { + file.path(self$root_path, "vggface2", "processed") + } + ) +) \ No newline at end of file diff --git a/man/oxfordiiitpet_segmentation_dataset.Rd b/man/oxfordiiitpet_segmentation_dataset.Rd index c1a059ad..aae686e2 100644 --- a/man/oxfordiiitpet_segmentation_dataset.Rd +++ b/man/oxfordiiitpet_segmentation_dataset.Rd @@ -69,5 +69,6 @@ tensor_image_browse(overlay) Other segmentation_dataset: \code{\link{pascal_voc_datasets}}, \code{\link{rf100_peixos_segmentation_dataset}()} +\code{\link{vggface2_dataset}()} } \concept{segmentation_dataset} diff --git a/man/pascal_voc_datasets.Rd b/man/pascal_voc_datasets.Rd index 24c198e4..f5d0dd74 100644 --- a/man/pascal_voc_datasets.Rd +++ b/man/pascal_voc_datasets.Rd @@ -130,6 +130,7 @@ tensor_image_browse(boxed_img) Other segmentation_dataset: \code{\link{oxfordiiitpet_segmentation_dataset}()}, \code{\link{rf100_peixos_segmentation_dataset}()} +\code{\link{vggface2_dataset}()} Other detection_dataset: \code{\link{coco_detection_dataset}()}, diff --git a/man/vggface2_dataset.Rd b/man/vggface2_dataset.Rd new file mode 100644 index 00000000..db558d87 --- /dev/null +++ b/man/vggface2_dataset.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-vggface2.R 
+\name{vggface2_dataset} +\alias{vggface2_dataset} +\title{VGGFace2 Dataset} +\usage{ +vggface2_dataset( + root = tempdir(), + train = TRUE, + transform = NULL, + target_transform = NULL, + download = FALSE +) +} +\arguments{ +\item{root}{Character. Root directory where the dataset will be stored under \code{root/vggface2}.} + +\item{train}{Logical. If TRUE, use the training set; otherwise, use the test set. Not applicable to all datasets.} + +\item{transform}{Optional. A function that takes an image and returns a transformed version (e.g., normalization, cropping).} + +\item{target_transform}{Optional. A function that transforms the label.} + +\item{download}{Logical. If TRUE, downloads the dataset to \verb{root/}. If the dataset is already present, download is skipped.} +} +\value{ +A torch dataset object \code{vggface2_dataset}: +\itemize{ +\item \code{x}: RGB image array. +\item \code{y}: Integer label (1…N) for the identity. +} + +\code{ds$classes} is a named list mapping integer labels to a list with: +\itemize{ +\item \code{name}: Character name of the person. +\item \code{gender}: "Male" or "Female". +} +} +\description{ +The VGGFace2 dataset is a large-scale face recognition dataset containing images +of celebrities from a wide range of ethnicities, professions, and ages. +Each identity has multiple images with large variations in pose, age, illumination, +ethnicity, and profession. +} +\examples{ +\dontrun{ +#Load the training set +ds <- vggface2_dataset(download = TRUE) +item <- ds[1] +item$x # image array RGB +item$y # integer label +ds$classes[item$y] # list(name=..., gender=...) + +#Load the test set +ds <- vggface2_dataset(download = TRUE, train = FALSE) +item <- ds[1] +item$x # image array RGB +item$y # integer label +ds$classes[item$y] # list(name=..., gender=...) 
+} + +} +\seealso{ +Other segmentation_dataset: +\code{\link{oxfordiiitpet_segmentation_dataset}()}, +\code{\link{pascal_voc_datasets}} +} +\concept{segmentation_dataset} diff --git a/tests/testthat/test-dataset-vggface2.R b/tests/testthat/test-dataset-vggface2.R new file mode 100644 index 00000000..b4b57ee3 --- /dev/null +++ b/tests/testthat/test-dataset-vggface2.R @@ -0,0 +1,32 @@ +context('dataset-vggface2') + +t <- withr::local_tempdir() +options(timeout = 60000) + +test_that("VGGFace2 dataset works correctly for train split", { + + skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + + vgg <- vggface2_dataset(root = t, download = TRUE) + expect_length(vgg, 3141890) + first_item <- vgg[1] + expect_named(first_item, c("x", "y")) + expect_type(first_item$x, "double") + expect_type(first_item$y, "integer") + expect_equal(first_item$y, 1) +}) + +test_that("VGGFace2 dataset works correctly for test split", { + + skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + + vgg <- vggface2_dataset(root = t, train = FALSE) + expect_length(vgg, 169396) + first_item <- vgg[1] + expect_named(first_item, c("x", "y")) + expect_type(first_item$x, "double") + expect_type(first_item$y, "integer") + expect_equal(first_item$y, 1) +}) \ No newline at end of file