From dc8d4ee4b25ae641358bf527b39034bca71ae5b4 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Tue, 12 Aug 2025 23:10:21 +0530 Subject: [PATCH 01/14] Adding dataset vggface2 #224 --- R/dataset-vggface2.R | 199 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 R/dataset-vggface2.R diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R new file mode 100644 index 00000000..ba87b362 --- /dev/null +++ b/R/dataset-vggface2.R @@ -0,0 +1,199 @@ +#' VGGFace2 Dataset +#' +#' The VGGFace2 dataset is a large-scale face recognition dataset containing images +#' of celebrities from a wide range of ethnicities, professions, and ages. +#' Each identity has multiple images with large variations in pose, age, illumination, +#' ethnicity, and profession. +#' +#' @inheritParams oxfordiiitpet_dataset +#' @param root Character. Root directory where the dataset will be stored under `root/vggface2`. +#' +#' @return A torch dataset object `vggface2_dataset`: +#' - `x`: RGB image array. +#' - `y`: Integer label (1…N) for the identity. +#' +#' `ds$classes` is a named list mapping integer labels to a list with: +#' - `name`: Character name of the person. +#' - `gender`: "Male" or "Female". +#' +#' @examples +#' \dontrun{ +#' #Load the training set +#' ds <- vggface2_dataset(download = TRUE) +#' item <- ds[1] +#' item$x # image tensor +#' item$y # integer label +#' ds$classes[item$y] # list(name=..., gender=...) +#' +#' #Load the test set +#' ds <- vggface2_dataset(download = TRUE, train = FALSE) +#' item <- ds[1] +#' item$x # image tensor +#' item$y # integer label +#' ds$classes[item$y] # list(name=..., gender=...) 
+#' } +#' +#' @family segmentation_dataset +#' @export +vggface2_dataset <- torch::dataset( + name = "vggface2", + resources = list( + train_images = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz", + test_images = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz", + train_list = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt", + test_list = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt", + identity = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv" + ), + archive_size = "38 GB", + training_file = "train.rds", + test_file = "test.rds", + + initialize = function( + root = tempdir(), + train = TRUE, + transform = NULL, + target_transform = NULL, + download = FALSE + ) { + self$root_path <- root + self$train <- train + self$transform <- transform + self$target_transform <- target_transform + if (train) { + self$split <- "train" + self$archive_size <- "36 GB" + } else { + self$split <- "test" + self$archive_size <- "2 GB" + } + + if (download) { + cli_inform("Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.") + self$download() + } + + if (!self$check_exists()) { + cli_abort("Dataset not found. 
You can use `download = TRUE` to download it.") + } + + if (train) { + data_file <- self$training_file + } else { + data_file <- self$test_file + } + data <- readRDS(file.path(self$processed_folder, data_file)) + + self$img_path <- data$img_path + self$labels <- data$labels + self$classes <- data$classes + + cli_inform("{.cls {class(self)[[1]]}} dataset loaded with {self$.length()} images across {length(self$classes)} classes.") + }, + + download = function() { + if (self$check_exists()) { + return() + } + + fs::dir_create(self$raw_folder) + fs::dir_create(self$processed_folder) + + cli_inform("Downloading {.cls {class(self)[[1]]}}...") + + download_and_extract <- function(url, exdir) { + archive <- download_and_cache(url, prefix = class(self)[1]) + utils::untar(archive, exdir = exdir) + } + + download_and_extract(self$resources$train_images, self$raw_folder) + download_and_extract(self$resources$test_images, self$raw_folder) + + train_list_file <- file.path(self$raw_folder, "train_list.txt") + download.file(self$resources$train_list, train_list_file, mode = "wb") + test_list_file <- file.path(self$raw_folder, "test_list.txt") + download.file(self$resources$test_list, test_list_file, mode = "wb") + + identity_file <- file.path(self$raw_folder, "identity_meta.csv") + download.file(self$resources$identity, identity_file, mode = "wb") + + identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE) + identity_df$Class_ID <- trimws(identity_df$Class_ID) + identity_map <- setNames( + lapply(seq_len(nrow(identity_df)), function(i) { + if(identity_df$Gender[i] == 'f'){ + gender <- "Female" + }else{ + gender <- "Male" + } + list( + name = identity_df$Name[i], + gender = gender + ) + }), + identity_df$Class_ID + ) + + for (split in c("train", "test")) { + if (split == "train") { + list_file <- train_list_file + } else { + list_file <- test_list_file + } + files <- readLines(list_file) + + img_path <- file.path(self$raw_folder, split, 
files) + class_ids <- sub("/.*$", "", files) + unique_ids <- unique(class_ids) + + class_to_idx <- setNames(seq_along(unique_ids), unique_ids) + + labels <- as.integer(class_to_idx[class_ids]) + + classes_list <- lapply(unique_ids, function(cid) { + identity_map[[cid]] + }) + + saveRDS( + list( + img_path = img_path, + labels = labels, + classes = classes_list + ), + file.path(self$processed_folder, paste0(split, ".rds")) + ) + } + + cli_inform("Dataset {.cls {class(self)[[1]]}} downloaded and extracted successfully.") + }, + + check_exists = function() { + fs::file_exists(file.path(self$processed_folder, self$training_file)) && + fs::file_exists(file.path(self$processed_folder, self$test_file)) + }, + + .getitem = function(index) { + x <- jpeg::readJPEG(self$img_path[index]) + y <- self$labels[index] + + if (!is.null(self$transform)) { + x <- self$transform(x) + } + if (!is.null(self$target_transform)) { + y <- self$target_transform(y) + } + list(x = x, y = y) + }, + + .length = function() { + length(self$img_path) + }, + + active = list( + raw_folder = function() { + file.path(self$root_path, "vggface2", "raw") + }, + processed_folder = function() { + file.path(self$root_path, "vggface2", "processed") + } + ) +) \ No newline at end of file From 20c5a49fe6806b3f093be26319555e7e82a5bbcd Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Tue, 12 Aug 2025 23:10:35 +0530 Subject: [PATCH 02/14] Adding NAMESPACE #224 --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 96c2221d..1adab866 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,6 +187,7 @@ export(transform_rotate) export(transform_ten_crop) export(transform_to_tensor) export(transform_vflip) +export(vggface2_dataset) export(vision_make_grid) export(whoi_plankton_dataset) export(whoi_small_plankton_dataset) From 7d0215303b7be60c9aff394a5d8e67796ccddef9 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Tue, 12 Aug 2025 23:11:12 +0530 Subject: [PATCH 03/14] Adding 
documentation for vggface2 #224 --- man/oxfordiiitpet_segmentation_dataset.Rd | 3 +- man/pascal_voc_datasets.Rd | 3 +- man/vggface2_dataset.Rd | 68 +++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 man/vggface2_dataset.Rd diff --git a/man/oxfordiiitpet_segmentation_dataset.Rd b/man/oxfordiiitpet_segmentation_dataset.Rd index e6de3a8f..aa41b913 100644 --- a/man/oxfordiiitpet_segmentation_dataset.Rd +++ b/man/oxfordiiitpet_segmentation_dataset.Rd @@ -67,6 +67,7 @@ tensor_image_browse(overlay) } \seealso{ Other segmentation_dataset: -\code{\link{pascal_voc_datasets}} +\code{\link{pascal_voc_datasets}}, +\code{\link{vggface2_dataset}()} } \concept{segmentation_dataset} diff --git a/man/pascal_voc_datasets.Rd b/man/pascal_voc_datasets.Rd index 18773c2d..b200c8be 100644 --- a/man/pascal_voc_datasets.Rd +++ b/man/pascal_voc_datasets.Rd @@ -128,7 +128,8 @@ tensor_image_browse(boxed_img) } \seealso{ Other segmentation_dataset: -\code{\link{oxfordiiitpet_segmentation_dataset}()} +\code{\link{oxfordiiitpet_segmentation_dataset}()}, +\code{\link{vggface2_dataset}()} Other detection_dataset: \code{\link{coco_detection_dataset}()} diff --git a/man/vggface2_dataset.Rd b/man/vggface2_dataset.Rd new file mode 100644 index 00000000..2d7319f6 --- /dev/null +++ b/man/vggface2_dataset.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-vggface2.R +\name{vggface2_dataset} +\alias{vggface2_dataset} +\title{VGGFace2 Dataset} +\usage{ +vggface2_dataset( + root = tempdir(), + train = TRUE, + transform = NULL, + target_transform = NULL, + download = FALSE +) +} +\arguments{ +\item{root}{Character. Root directory where the dataset will be stored under \code{root/vggface2}.} + +\item{train}{Logical. If TRUE, use the training set; otherwise, use the test set. Not applicable to all datasets.} + +\item{transform}{Optional. 
A function that takes an image and returns a transformed version (e.g., normalization, cropping).} + +\item{target_transform}{Optional. A function that transforms the label.} + +\item{download}{Logical. If TRUE, downloads the dataset to \verb{root/}. If the dataset is already present, download is skipped.} +} +\value{ +A torch dataset object \code{vggface2_dataset}: +\itemize{ +\item \code{x}: RGB image array. +\item \code{y}: Integer label (1…N) for the identity. +} + +\code{ds$classes} is a named list mapping integer labels to a list with: +\itemize{ +\item \code{name}: Character name of the person. +\item \code{gender}: "Male" or "Female". +} +} +\description{ +The VGGFace2 dataset is a large-scale face recognition dataset containing images +of celebrities from a wide range of ethnicities, professions, and ages. +Each identity has multiple images with large variations in pose, age, illumination, +ethnicity, and profession. +} +\examples{ +\dontrun{ +#Load the training set +ds <- vggface2_dataset(download = TRUE) +item <- ds[1] +item$x # image tensor +item$y # integer label +ds$classes[item$y] # list(name=..., gender=...) + +#Load the test set +ds <- vggface2_dataset(download = TRUE, train = FALSE) +item <- ds[1] +item$x # image tensor +item$y # integer label +ds$classes[item$y] # list(name=..., gender=...) 
+} + +} +\seealso{ +Other segmentation_dataset: +\code{\link{oxfordiiitpet_segmentation_dataset}()}, +\code{\link{pascal_voc_datasets}} +} +\concept{segmentation_dataset} From 9d52d80383f3e79a59e2e41e5c1dc3f2e9c77c7c Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 00:23:33 +0530 Subject: [PATCH 04/14] Adding tests for vggface2 #224 --- tests/testthat/test-dataset-vggface2.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tests/testthat/test-dataset-vggface2.R diff --git a/tests/testthat/test-dataset-vggface2.R b/tests/testthat/test-dataset-vggface2.R new file mode 100644 index 00000000..f6e7a40f --- /dev/null +++ b/tests/testthat/test-dataset-vggface2.R @@ -0,0 +1,25 @@ +context('dataset-vggface2') + +t <- withr::local_tempdir() + +test_that("VGGFace2 dataset works correctly for train split", { + + vgg <- vggface2_dataset(root = t, download = TRUE) + expect_length(vgg, 3141890) + first_item <- vgg[1] + expect_named(first_item, c("x", "y")) + expect_type(first_item$x, "double") + expect_type(first_item$y, "integer") + expect_equal(first_item$y, 1) +}) + +test_that("VGGFace2 dataset works correctly for test split", { + + vgg <- vggface2_dataset(root = t, train = FALSE) + expect_length(vgg, 169396) + first_item <- vgg[1] + expect_named(first_item, c("x", "y")) + expect_type(first_item$x, "double") + expect_type(first_item$y, "integer") + expect_equal(first_item$y, 1) +}) \ No newline at end of file From 954b0c4a82c1f00eb4154c39af51a9b49f40781c Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 00:40:12 +0530 Subject: [PATCH 05/14] Adding skips to tests, tests successful on local #224 --- tests/testthat/test-dataset-vggface2.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/testthat/test-dataset-vggface2.R b/tests/testthat/test-dataset-vggface2.R index f6e7a40f..914de1df 100644 --- a/tests/testthat/test-dataset-vggface2.R +++ b/tests/testthat/test-dataset-vggface2.R @@ -4,6 +4,9
@@ t <- withr::local_tempdir() test_that("VGGFace2 dataset works correctly for train split", { + skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + vgg <- vggface2_dataset(root = t, download = TRUE) expect_length(vgg, 3141890) first_item <- vgg[1] @@ -15,6 +18,9 @@ test_that("VGGFace2 dataset works correctly for train split", { test_that("VGGFace2 dataset works correctly for test split", { + skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + vgg <- vggface2_dataset(root = t, train = FALSE) expect_length(vgg, 169396) first_item <- vgg[1] From 61672116fdc45e35080f8ea953c79f025e01086c Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 00:47:20 +0530 Subject: [PATCH 06/14] Testing on CI #224 --- tests/testthat/test-dataset-vggface2.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-dataset-vggface2.R b/tests/testthat/test-dataset-vggface2.R index 914de1df..6ea7df01 100644 --- a/tests/testthat/test-dataset-vggface2.R +++ b/tests/testthat/test-dataset-vggface2.R @@ -1,11 +1,12 @@ context('dataset-vggface2') t <- withr::local_tempdir() +options(timeout = 60000) test_that("VGGFace2 dataset works correctly for train split", { - skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, - "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") +# skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, +# "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") vgg <- vggface2_dataset(root = t, download = TRUE) expect_length(vgg, 3141890) @@ -18,8 +19,8 @@ test_that("VGGFace2 dataset works correctly for train split", { test_that("VGGFace2 dataset works correctly for test split", { - skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, - "Skipping 
test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") +# skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, +# "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") vgg <- vggface2_dataset(root = t, train = FALSE) expect_length(vgg, 169396) From 14b4ac55cb8d8033d7e34ae9d04e7056bf853bf3 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 12:22:19 +0530 Subject: [PATCH 07/14] Fix Downloads for text lists #224 --- R/dataset-vggface2.R | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R index ba87b362..d6f586a9 100644 --- a/R/dataset-vggface2.R +++ b/R/dataset-vggface2.R @@ -108,13 +108,9 @@ vggface2_dataset <- torch::dataset( download_and_extract(self$resources$train_images, self$raw_folder) download_and_extract(self$resources$test_images, self$raw_folder) - train_list_file <- file.path(self$raw_folder, "train_list.txt") - download.file(self$resources$train_list, train_list_file, mode = "wb") - test_list_file <- file.path(self$raw_folder, "test_list.txt") - download.file(self$resources$test_list, test_list_file, mode = "wb") - - identity_file <- file.path(self$raw_folder, "identity_meta.csv") - download.file(self$resources$identity, identity_file, mode = "wb") + train_list_file <- download_and_cache(self$resources$train_list, prefix = "train_list") + test_list_file <- download_and_cache(self$resources$test_list, prefix = "test_list") + identity_file <- download_and_cache(self$resources$identity, prefix = "identity_meta") identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE) identity_df$Class_ID <- trimws(identity_df$Class_ID) From 8f68e69ed01989ba9471aee6926705c92c2d8e0b Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 14:50:52 +0530 Subject: [PATCH 08/14] Fix downloads and add checksums #224 --- R/dataset-vggface2.R | 48 
++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R index d6f586a9..5659b75d 100644 --- a/R/dataset-vggface2.R +++ b/R/dataset-vggface2.R @@ -38,11 +38,11 @@ vggface2_dataset <- torch::dataset( name = "vggface2", resources = list( - train_images = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz", - test_images = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz", - train_list = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt", - test_list = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt", - identity = "https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv" + train_images = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz","88813c6b15de58afc8fa75ea83361d7f"), + test_images = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz","bb7a323824d1004e14e00c23974facd3"), + train_list = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt","4cfbab4a839163f454d7ecef28b68669"), + test_list = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt","d08b10f12bc9889509364ef56d73c621"), + identity = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv","d315386c7e8e166c4f60e27d9cc61acc") ), archive_size = "38 GB", training_file = "train.rds", @@ -100,18 +100,36 @@ vggface2_dataset <- torch::dataset( cli_inform("Downloading {.cls {class(self)[[1]]}}...") - download_and_extract <- function(url, exdir) { - archive <- download_and_cache(url, prefix = class(self)[1]) - utils::untar(archive, exdir = exdir) + archive <- 
download_and_cache(self$resources$train_images[1], prefix = class(self)[1]) + if (tools::md5sum(archive) != self$resources$train_images[2]) { + runtime_error("Corrupt file! Delete the file in {archive} and try again.") } + utils::untar(archive, exdir = self$raw_folder) - download_and_extract(self$resources$train_images, self$raw_folder) - download_and_extract(self$resources$test_images, self$raw_folder) + archive <- download_and_cache(self$resources$test_images[1], prefix = class(self)[1]) + if (tools::md5sum(archive) != self$resources$test_images[2]) { + runtime_error("Corrupt file! Delete the file in {archive} and try again.") + } + utils::untar(archive, exdir = self$raw_folder) - train_list_file <- download_and_cache(self$resources$train_list, prefix = "train_list") - test_list_file <- download_and_cache(self$resources$test_list, prefix = "test_list") - identity_file <- download_and_cache(self$resources$identity, prefix = "identity_meta") + archive <- download_and_cache(self$resources$train_list[1], prefix = "train_list") + if (tools::md5sum(archive) != self$resources$train_list[2]) { + runtime_error("Corrupt file! Delete the file in {archive} and try again.") + } + fs::file_move(archive, self$raw_folder) + archive <- download_and_cache(self$resources$test_list[1], prefix = "test_list") + if (tools::md5sum(archive) != self$resources$test_list[2]) { + runtime_error("Corrupt file! Delete the file in {archive} and try again.") + } + fs::file_move(archive, self$raw_folder) + + archive <- download_and_cache(self$resources$identity[1], prefix = "identity_meta") + if (tools::md5sum(archive) != self$resources$identity[2]) { + runtime_error("Corrupt file! 
Delete the file in {archive} and try again.") + } + fs::file_move(archive, self$raw_folder) + identity_file <- file.path(self$raw_folder, "identity_meta.csv") identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE) identity_df$Class_ID <- trimws(identity_df$Class_ID) identity_map <- setNames( @@ -131,9 +149,9 @@ vggface2_dataset <- torch::dataset( for (split in c("train", "test")) { if (split == "train") { - list_file <- train_list_file + list_file <- file.path(self$raw_folder, "train_list.txt") } else { - list_file <- test_list_file + list_file <- file.path(self$raw_folder, "test_list.txt") } files <- readLines(list_file) From 3d49e34688341c6aedcdcaaa2fb692d7c672797c Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 17:39:22 +0530 Subject: [PATCH 09/14] Adding skips as dataset is too big to be tested on CI #224 --- tests/testthat/test-dataset-vggface2.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-dataset-vggface2.R b/tests/testthat/test-dataset-vggface2.R index 6ea7df01..b4b57ee3 100644 --- a/tests/testthat/test-dataset-vggface2.R +++ b/tests/testthat/test-dataset-vggface2.R @@ -5,8 +5,8 @@ options(timeout = 60000) test_that("VGGFace2 dataset works correctly for train split", { -# skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, -# "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") vgg <- vggface2_dataset(root = t, download = TRUE) expect_length(vgg, 3141890) @@ -19,8 +19,8 @@ test_that("VGGFace2 dataset works correctly for train split", { test_that("VGGFace2 dataset works correctly for test split", { -# skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, -# "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") + 
skip_if(Sys.getenv("TEST_LARGE_DATASETS", unset = 0) != 1, + "Skipping test: set TEST_LARGE_DATASETS=1 to enable tests requiring large downloads.") vgg <- vggface2_dataset(root = t, train = FALSE) expect_length(vgg, 169396) From 5a7ce18fc618fbf643f5fa4a87dfe72f1199b3b0 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Wed, 13 Aug 2025 18:54:48 +0530 Subject: [PATCH 10/14] Updating documentation --- R/dataset-vggface2.R | 4 ++-- man/vggface2_dataset.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R index 5659b75d..6bd084dd 100644 --- a/R/dataset-vggface2.R +++ b/R/dataset-vggface2.R @@ -21,14 +21,14 @@ #' #Load the training set #' ds <- vggface2_dataset(download = TRUE) #' item <- ds[1] -#' item$x # image tensor +#' item$x # image array RGB #' item$y # integer label #' ds$classes[item$y] # list(name=..., gender=...) #' #' #Load the test set #' ds <- vggface2_dataset(download = TRUE, train = FALSE) #' item <- ds[1] -#' item$x # image tensor +#' item$x # image array RGB #' item$y # integer label #' ds$classes[item$y] # list(name=..., gender=...) #' } diff --git a/man/vggface2_dataset.Rd b/man/vggface2_dataset.Rd index 2d7319f6..db558d87 100644 --- a/man/vggface2_dataset.Rd +++ b/man/vggface2_dataset.Rd @@ -47,14 +47,14 @@ ethnicity, and profession. #Load the training set ds <- vggface2_dataset(download = TRUE) item <- ds[1] -item$x # image tensor +item$x # image array RGB item$y # integer label ds$classes[item$y] # list(name=..., gender=...) #Load the test set ds <- vggface2_dataset(download = TRUE, train = FALSE) item <- ds[1] -item$x # image tensor +item$x # image array RGB item$y # integer label ds$classes[item$y] # list(name=..., gender=...) 
} From 38159bcd280e61d6c78c3df83178159b40813ba5 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Tue, 26 Aug 2025 19:01:35 +0530 Subject: [PATCH 11/14] Added NEWS.md entry --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 17bb22d9..42f145d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,7 @@ * Added `places365_dataset()`for loading the Places365 dataset (@koshtiakanksha, #196). * Added `pascal_segmentation_dataset()`, and `pascal_detection_dataset()` for loading the Pascal Visual Object Classes datasets (@DerrickUnleashed, #209). * Added `whoi_plankton_dataset()`, and `whoi_small_plankton_dataset()` (@cregouby, #236). +* Added `vggface2_dataset()` for loading the VGGFace2 dataset (@DerrickUnleashed, #238). ## New models From ef246fd841e38021e58628684c7b4afada87e004 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Sun, 14 Sep 2025 22:59:25 +0530 Subject: [PATCH 12/14] Switching to dataframes for resources lists --- R/dataset-vggface2.R | 64 +++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R index 6bd084dd..6221917d 100644 --- a/R/dataset-vggface2.R +++ b/R/dataset-vggface2.R @@ -37,14 +37,23 @@ #' @export vggface2_dataset <- torch::dataset( name = "vggface2", - resources = list( - train_images = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz","88813c6b15de58afc8fa75ea83361d7f"), - test_images = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz","bb7a323824d1004e14e00c23974facd3"), - train_list = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt","4cfbab4a839163f454d7ecef28b68669"), - test_list = c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt","d08b10f12bc9889509364ef56d73c621"), - identity = 
c("https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv","d315386c7e8e166c4f60e27d9cc61acc") + resources = data.frame( + split = c("train_images", "test_images", "train_list", "test_list", "identity"), + url = c( + "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_train.tar.gz", + "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/data/vggface2_test.tar.gz", + "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/train_list.txt", + "https://huggingface.co/datasets/ProgramComputer/VGGFace2/resolve/main/meta/test_list.txt", + "https://huggingface.co/datasets/ProgramComputer/VGGFace2/raw/main/meta/identity_meta.csv" + ), + md5 = c( + "88813c6b15de58afc8fa75ea83361d7f", + "bb7a323824d1004e14e00c23974facd3", + "4cfbab4a839163f454d7ecef28b68669", + "d08b10f12bc9889509364ef56d73c621", + "d315386c7e8e166c4f60e27d9cc61acc" + ) ), - archive_size = "38 GB", training_file = "train.rds", test_file = "test.rds", @@ -56,7 +65,6 @@ vggface2_dataset <- torch::dataset( download = FALSE ) { self$root_path <- root - self$train <- train self$transform <- transform self$target_transform <- target_transform if (train) { @@ -100,35 +108,19 @@ vggface2_dataset <- torch::dataset( cli_inform("Downloading {.cls {class(self)[[1]]}}...") - archive <- download_and_cache(self$resources$train_images[1], prefix = class(self)[1]) - if (tools::md5sum(archive) != self$resources$train_images[2]) { - runtime_error("Corrupt file! Delete the file in {archive} and try again.") - } - utils::untar(archive, exdir = self$raw_folder) - - archive <- download_and_cache(self$resources$test_images[1], prefix = class(self)[1]) - if (tools::md5sum(archive) != self$resources$test_images[2]) { - runtime_error("Corrupt file! 
Delete the file in {archive} and try again.") - } - utils::untar(archive, exdir = self$raw_folder) - - archive <- download_and_cache(self$resources$train_list[1], prefix = "train_list") - if (tools::md5sum(archive) != self$resources$train_list[2]) { - runtime_error("Corrupt file! Delete the file in {archive} and try again.") - } - fs::file_move(archive, self$raw_folder) - - archive <- download_and_cache(self$resources$test_list[1], prefix = "test_list") - if (tools::md5sum(archive) != self$resources$test_list[2]) { - runtime_error("Corrupt file! Delete the file in {archive} and try again.") - } - fs::file_move(archive, self$raw_folder) - - archive <- download_and_cache(self$resources$identity[1], prefix = "identity_meta") - if (tools::md5sum(archive) != self$resources$identity[2]) { - runtime_error("Corrupt file! Delete the file in {archive} and try again.") + for (i in seq_len(nrow(self$resources))) { + row <- self$resources[i, ] + archive <- download_and_cache(row$url, prefix = row$split) + if (tools::md5sum(archive) != row$md5) { + runtime_error("Corrupt file! 
Delete the file in {archive} and try again.") + } + if (tools::file_ext(row$url) == "gz") { + utils::untar(archive, exdir = self$raw_folder) + } else { + fs::file_move(archive, self$raw_folder) + } } - fs::file_move(archive, self$raw_folder) + identity_file <- file.path(self$raw_folder, "identity_meta.csv") identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE) identity_df$Class_ID <- trimws(identity_df$Class_ID) From 3a30eb0a500e33fb93abf3725e2647b7bbaa61fb Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Sun, 14 Sep 2025 23:26:27 +0530 Subject: [PATCH 13/14] Updating DESCRIPTION --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 47fc9ec7..5fe92f4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -80,6 +80,7 @@ Collate: 'dataset-places365.R' 'dataset-plankton.R' 'dataset-rf100-peixos.R' + 'dataset-vggface2.R' 'extension.R' 'globals.R' 'imagenet.R' From 0527fe5090b52f298637635b82d6aedcdd2ab2f2 Mon Sep 17 00:00:00 2001 From: DerrickUnleashed Date: Mon, 15 Sep 2025 01:08:43 +0530 Subject: [PATCH 14/14] Using dataframes for storage instead of lists #224 --- R/dataset-vggface2.R | 64 +++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/R/dataset-vggface2.R b/R/dataset-vggface2.R index 6221917d..7019989b 100644 --- a/R/dataset-vggface2.R +++ b/R/dataset-vggface2.R @@ -124,51 +124,31 @@ vggface2_dataset <- torch::dataset( identity_file <- file.path(self$raw_folder, "identity_meta.csv") identity_df <- read.csv(identity_file, sep = ",", stringsAsFactors = FALSE, strip.white = TRUE) identity_df$Class_ID <- trimws(identity_df$Class_ID) - identity_map <- setNames( - lapply(seq_len(nrow(identity_df)), function(i) { - if(identity_df$Gender[i] == 'f'){ - gender <- "Female" - }else{ - gender <- "Male" - } - list( - name = identity_df$Name[i], - gender = gender - ) - }), - identity_df$Class_ID - ) + identity_df$Gender <- 
factor(identity_df$Gender, levels = c("f", "m"), labels = c("Female", "Male")) for (split in c("train", "test")) { - if (split == "train") { - list_file <- file.path(self$raw_folder, "train_list.txt") - } else { - list_file <- file.path(self$raw_folder, "test_list.txt") - } - files <- readLines(list_file) - - img_path <- file.path(self$raw_folder, split, files) - class_ids <- sub("/.*$", "", files) - unique_ids <- unique(class_ids) - - class_to_idx <- setNames(seq_along(unique_ids), unique_ids) - - labels <- as.integer(class_to_idx[class_ids]) - - classes_list <- lapply(unique_ids, function(cid) { - identity_map[[cid]] - }) - - saveRDS( - list( - img_path = img_path, - labels = labels, - classes = classes_list - ), - file.path(self$processed_folder, paste0(split, ".rds")) - ) - } + if (split == "train") { + list_file <- file.path(self$raw_folder, "train_list.txt") + } else { + list_file <- file.path(self$raw_folder, "test_list.txt") + } + split_df <- read.delim( + list_file, + sep = "/", + col.names = c("Class_ID", "img_path"), + header = FALSE, + stringsAsFactors = FALSE + ) + + merged_df <- merge(split_df, identity_df, by = "Class_ID", all.x = TRUE) + merged_df$Label <- as.integer(factor(merged_df$Class_ID, levels = unique(merged_df$Class_ID))) + + saveRDS( + merged_df, + file.path(self$processed_folder, paste0(split, ".rds")) + ) + } cli_inform("Dataset {.cls {class(self)[[1]]}} downloaded and extracted successfully.") },