Skip to content

Conversation

@ben-schwen
Copy link
Member

@ben-schwen ben-schwen commented Dec 16, 2025

Not directly demanded but improving benchmarks

image image
Details ```r library(atime) library(data.table)

pkg.path = '.'
limit = 5

taken from .ci/atime/tests.R

pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace = function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex = gsub(".", "?", old.Package, fixed = TRUE)
Package
= gsub(".", "", old.Package, fixed = TRUE)
new.Package
= paste0(Package_, "", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package
)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\("%s"\)', old.Package),
sprintf('packageVersion\("%s"\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "", new.Package)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
}

versions = c(
'before' = 'b0c4ac3b',
'after' = 'df05b9d0'
)

set.seed(123)

===== Benchmark 1: Small groups (10 elements each) =====

N = as.integer(10^seq(2, 6, 0.25))
grpsize_small = 10

gmedian_work1 = lapply(setNames(nm = N), function(n) {
data.table(
grp = rep(1:n, each = grpsize_small),
x = rnorm(n * grpsize_small)
)
})

gmedian_bench1 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work1[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench1)

===== Benchmark 2: Medium groups (100 elements each) =====

N = as.integer(10^seq(2, 5, 0.25))
grpsize_medium = 100

gmedian_work2 = lapply(setNames(nm = N), function(n) {
data.table(
grp = rep(1:n, each = grpsize_medium),
x = rnorm(n * grpsize_medium)
)
})

gmedian_bench2 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work2[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench2)

===== Benchmark 3: Large groups (1000 elements each) =====

N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_large = 1000

gmedian_work3 = lapply(setNames(nm = N), function(n) {
data.table(
grp = rep(1:n, each = grpsize_large),
x = rnorm(n * grpsize_large)
)
})

gmedian_bench3 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work3[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench3)

===== Benchmark 4: Very large groups (10000 elements each) =====

N = as.integer(10^seq(1, 3.5, 0.25))
grpsize_vlarge = 10000

gmedian_work4 = lapply(setNames(nm = N), function(n) {
data.table(
grp = rep(1:n, each = grpsize_vlarge),
x = rnorm(n * grpsize_vlarge)
)
})

gmedian_bench4 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work4[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench4)

===== Benchmark 5: Integer data =====

N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_int = 1000

gmedian_work5 = lapply(setNames(nm = N), function(n) {
data.table(
grp = rep(1:n, each = grpsize_int),
x = sample(1L:1000L, n * grpsize_int, replace = TRUE)
)
})

gmedian_bench5 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work5[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench5)

===== Benchmark 6: With NA values =====

N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_na = 1000

gmedian_work6 = lapply(setNames(nm = N), function(n) {
dt = data.table(
grp = rep(1:n, each = grpsize_na),
x = rnorm(n * grpsize_na)
)

Set 10% of values to NA

dt[sample(.N, as.integer(.N * 0.1)), x := NA_real_]
dt
})

gmedian_bench6 = atime_versions(
pkg.path, N,
setup = {
dt = gmedian_work6[[as.character(N)]]
},
expr = data.table:::[.data.table(dt, , .(med = median(x, na.rm = TRUE)), by = grp),
seconds.limit = limit, verbose = TRUE, sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench6)

===== SAVE BENCHMARK RESULTS =====

results_dir = "benchmark_results"
dir.create(results_dir, showWarnings = FALSE)

save(
gmedian_bench1, gmedian_bench2, gmedian_bench3,
gmedian_bench4, gmedian_bench5, gmedian_bench6,
file = file.path(results_dir, "benchmarks_gmedian.rda")
)

library(ggplot2)
library(patchwork)

load(file.path(results_dir, "benchmarks_gmedian.rda"))

g = function(p, title) p + ggtitle(title)

p = list(
g(plot(gmedian_bench1), "Small groups (10 elements)"),
g(plot(gmedian_bench2), "Medium groups (100 elements)"),
g(plot(gmedian_bench3), "Large groups (1000 elements)"),
g(plot(gmedian_bench4), "Very large groups (10000 elements)"),
g(plot(gmedian_bench5), "Integer data (1000 elements)"),
g(plot(gmedian_bench6), "With 10% NA values (1000 elements)")
)

pdf("benchmark_gmedian_results.pdf", width = 12, height = 9)
for (i in seq(1, length(p), by = 4)) {
pg = wrap_plots(p[i:min(i+3, length(p))], ncol = 2, nrow = 2, guides = "collect")
print(pg)
}
dev.off()

</details>

@ben-schwen ben-schwen changed the title add floyd-rivest and parallelization to gmedian add floyd-rivest selection and parallelization to gmedian Dec 16, 2025
@codecov
Copy link

codecov bot commented Dec 16, 2025

Codecov Report

❌ Patch coverage is 33.33333% with 6 lines in your changes missing coverage. Please review.
✅ Project coverage is 99.03%. Comparing base (b0c4ac3) to head (36742ea).
⚠️ Report is 22 commits behind head on master.

Files with missing lines Patch % Lines
src/quickselect.c 0.00% 6 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##           master    #7481      +/-   ##
==========================================
- Coverage   99.06%   99.03%   -0.04%     
==========================================
  Files          86       86              
  Lines       16618    16605      -13     
==========================================
- Hits        16463    16444      -19     
- Misses        155      161       +6     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@github-actions
Copy link

github-actions bot commented Dec 16, 2025

No obvious timing issues in HEAD=gmedian
Comparison Plot

Generated via commit 36742ea

Download link for the artifact containing the test results: ↓ atime-results.zip

Task Duration
R setup and installing dependencies 2 minutes and 57 seconds
Installing different package versions 22 seconds
Running and plotting the test cases 3 minutes and 0 seconds

Co-authored-by: Michael Chirico <chiricom@google.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants