Merged
131 commits
837149c
add 500GB runs
Tmonster Sep 4, 2024
24b6e91
add more helper files
Tmonster Sep 4, 2024
69d68bc
added 500GB. need to modify some solutions to go on disk for the 500G…
Tmonster Sep 4, 2024
96bb929
add back in on_disk check for some solutions
Tmonster Sep 4, 2024
758be4b
add script to set up and run benchmark
Tmonster Sep 5, 2024
f7f2a5b
make it easier to choose what gets run
Tmonster Sep 5, 2024
10a90c9
fixed some scripts
Tmonster Sep 5, 2024
28754db
change ~duckdb to duckdb
Tmonster Sep 5, 2024
aa76eb5
fix more path issues
Tmonster Sep 5, 2024
ae508c6
one last fix
Tmonster Sep 5, 2024
07ea245
change permissions on run files
Tmonster Sep 5, 2024
67b1ec5
move -c
Tmonster Sep 5, 2024
c64fd3e
small update
Tmonster Sep 5, 2024
1179bf4
fix polar src_dataname
Tmonster Sep 5, 2024
a637af5
fix polar scale factor again
Tmonster Sep 5, 2024
bcb2831
neato
Tmonster Sep 5, 2024
66dc1a6
modify setup script to be more modular
Tmonster Sep 6, 2024
39fdd45
use aws s3 copy and create a _setup_utils specifically for setting up…
Tmonster Sep 6, 2024
6a4c1bc
fix run.conf and run.sh
Tmonster Sep 6, 2024
aef8e50
modify regression benchmark runner
Tmonster Sep 6, 2024
5b31321
fix some datafusion things
Tmonster Sep 6, 2024
960c07c
datafusion needs better scale factor
Tmonster Sep 6, 2024
1dce3c6
fix datafusion
Tmonster Sep 6, 2024
894eeef
some more updates to duckdb
Tmonster Sep 10, 2024
293d5d0
typo
Tmonster Sep 10, 2024
a3cba57
fix merge conflicts
Tmonster Sep 12, 2024
1a30c75
Merge branch 'add_500GB_run' of github.com:Tmonster/db-benchmark into…
Tmonster Sep 12, 2024
a6dbf35
add new line to path.env
Tmonster Sep 12, 2024
7563d4a
add datafusion ability to go off disk
Tmonster Sep 12, 2024
0b0e43f
some updates
Tmonster Sep 13, 2024
0855b4d
clean up, will add 500GB runs later
Tmonster Jan 14, 2025
d0ea331
add machine type to file names
Tmonster Jan 15, 2025
ce0d398
Add new column for machine type to time.csv and logs.csv
Tmonster Jan 15, 2025
491e61a
have scripts handle new machine type column
Tmonster Jan 15, 2025
b708a0c
write machine type to logs as well
Tmonster Jan 15, 2025
9cdac26
logs also needs machine type
Tmonster Jan 15, 2025
db1b5d8
change header from machine_size to machine_type
Tmonster Jan 15, 2025
f04e2fa
add machine type to data passed around
Tmonster Jan 15, 2025
3c93ec2
remove traces of 500GB
Tmonster Jan 15, 2025
1493560
fix active tab
Tmonster Jan 15, 2025
7ae9789
fix other solutions to use machine type
Tmonster Jan 16, 2025
ee1d220
fix naming when unpacking data
Tmonster Jan 16, 2025
348b59e
no default machine type sizes
Tmonster Jan 16, 2025
35e7877
more fixes to adding a new machine type
Tmonster Jan 17, 2025
3895781
Merge branch 'main' into add_new_machine_type
Tmonster Jan 17, 2025
d2c8bd4
some minor changes for PR
Tmonster Jan 17, 2025
ba0eb40
remove unused extract files
Tmonster Jan 17, 2025
64da7a1
remove traces of 1e10 code
Tmonster Jan 17, 2025
b302fbd
Merge branch 'main' into add_new_machine_type
Tmonster Jan 20, 2025
986f1c6
get rid of some clickhouse setup files. Use 'spill_dir' instead of mo…
Tmonster Jan 21, 2025
9a16a3e
modify some more clickhouse things
Tmonster Jan 21, 2025
548292b
clickhouse should store data on disk
Tmonster Jan 21, 2025
42a1bfb
fix syntax
Tmonster Jan 21, 2025
a52eed7
export on disk
Tmonster Jan 21, 2025
9121932
fix spark memory usage
Tmonster Jan 22, 2025
7533188
read machine_type env variable dask
Tmonster Jan 22, 2025
22b97fd
use aws machine names in time and logs
Tmonster Jan 22, 2025
4f4d17d
solutions have different naming now too
Tmonster Jan 22, 2025
0edc841
Run scripts also use new machine names
Tmonster Jan 22, 2025
9bc804b
more fixes to help with setup
Tmonster Jan 22, 2025
2a7f8ae
fix some small things
Tmonster Jan 22, 2025
83ef8f3
proper update to time and logs
Tmonster Jan 23, 2025
0e1d9dd
add new timings
Tmonster Jan 24, 2025
af13e6c
duckdb should spill to disk when machine is small
Tmonster Jan 24, 2025
49c038c
clickhouse should set up user that has low memory restraint
Tmonster Jan 24, 2025
cf1b854
helpers should not default machine type
Tmonster Jan 24, 2025
f3811ad
trying to figure out why the report wont generate
Tmonster Jan 24, 2025
980b72c
report correctly generates now
Tmonster Jan 24, 2025
b082ef0
index report now has all reports
Tmonster Jan 24, 2025
7032c7b
modify gitignore
Tmonster Jan 24, 2025
9960e67
modify datas
Tmonster Jan 24, 2025
cd3d62f
modify data back to original
Tmonster Jan 24, 2025
37ce639
add no sign request
Tmonster Jan 24, 2025
b83b604
fix some path stuff
Tmonster Jan 24, 2025
8736a0e
smaller changes to help with report creation
Tmonster Jan 27, 2025
e8fcddc
dont run 50GB join on c6id.4xlarge
Tmonster Jan 29, 2025
f4c716e
duckdb should have a temp table
Tmonster Jan 30, 2025
918b949
do not run window query on small machine
Tmonster Feb 5, 2025
3ee5cb9
add new duckdb results
Tmonster Feb 5, 2025
47add3f
change duckdb version to v1.2.0
Tmonster Feb 5, 2025
fcbf35a
update index.Rmd to show join results as well
Tmonster Feb 5, 2025
bc6006f
write machine type for duckdb join
Tmonster Feb 5, 2025
f82b66e
fix logs.csv
Tmonster Feb 5, 2025
1523b73
update error messages for 50GB datasets on small machine
Tmonster Feb 5, 2025
7059a00
add -p to mkdir
Tmonster Feb 5, 2025
ab685ff
update index.Rmd
Tmonster Feb 5, 2025
7553a9d
more changes to clickhouse setup (more permissions)
Tmonster Feb 5, 2025
9b9e9e5
more clickhouse checks
Tmonster Feb 5, 2025
de6c715
more recent clickhouse times
Tmonster Feb 5, 2025
851152c
add back in v1.2.0 results
Tmonster Feb 5, 2025
44a6a38
fix time.csv one more time
Tmonster Feb 5, 2025
6f3530c
update duckdb and clickhouse versions
Tmonster Feb 5, 2025
3b5f13b
fix duckdb times for small joins
Tmonster Feb 5, 2025
ea86c8b
always try to run large join even on small machine
Tmonster Feb 6, 2025
f011b3a
fix typo in clickhouse script and fix comment in run_large
Tmonster Feb 6, 2025
197b256
fix datafusion script
Tmonster Feb 6, 2025
68377d5
fix run.sh
Tmonster Feb 6, 2025
2e34c23
new clickhouse times
Tmonster Feb 6, 2025
646aceb
add time and logs that somehow disappeared
Tmonster Feb 6, 2025
c779a74
fix spark join script
Tmonster Feb 6, 2025
b1d385b
fix logs for clickhouse
Tmonster Feb 6, 2025
3c8cd44
clarify errors in report
Tmonster Feb 6, 2025
8c69917
fix regression script
Tmonster Feb 6, 2025
889589c
fix regression.yml again
Tmonster Feb 6, 2025
171137e
more fixing regression script
Tmonster Feb 6, 2025
1c793e2
in setup_utils, not utils
Tmonster Feb 6, 2025
be19cc4
run on both machine types, otherwise errors during validation
Tmonster Feb 6, 2025
e9917ca
fix spill dirs
Tmonster Feb 6, 2025
f5760b4
don't source the run.conf, will override the command line machine type
Tmonster Feb 6, 2025
0180e9d
remove machine type from run.conf anyway
Tmonster Feb 6, 2025
9a17d13
spark needs to shut down java, so sleep in between runs
Tmonster Feb 6, 2025
707b78e
more regression.yml fixes
Tmonster Feb 7, 2025
2cb4652
fix small report text
Tmonster Feb 7, 2025
618d4bb
Merge branch 'main' into add_new_machine_type
Tmonster Feb 7, 2025
36feff1
try to get julia working
Tmonster Feb 7, 2025
f314730
hopefully fix dask and pydatatable
Tmonster Feb 7, 2025
9bb3658
change name for github solo solutions
Tmonster Feb 10, 2025
dc9b0a0
remove steps from repro that are not needed
Tmonster Feb 10, 2025
515ce2c
try to fix julia ds again
Tmonster Feb 10, 2025
8fa928f
try to fix datafusion
Tmonster Feb 10, 2025
3ffd0a8
Merge branch 'main' into add_new_machine_type
Tmonster Feb 10, 2025
e9e6709
fix some setup scripts
Tmonster Feb 10, 2025
5ba2714
install dataframes in juliads so CI passes
Tmonster Feb 10, 2025
5e6575f
now julia should be fixes
Tmonster Feb 10, 2025
b3bbfa8
fix helpers.jl
Tmonster Feb 10, 2025
1a6623d
remove references to helpersds
Tmonster Feb 10, 2025
3a55cdc
fix (hopefully) last juliads problems
Tmonster Feb 10, 2025
9b8ac43
Revert "fix (hopefully) last juliads problems"
Tmonster Feb 10, 2025
d0e3779
Revert "remove references to helpersds"
Tmonster Feb 10, 2025
ef6d93a
Revert "now julia should be fixes"
Tmonster Feb 10, 2025
5b17cc1
give juliads its own helpers file
Tmonster Feb 10, 2025
41 changes: 27 additions & 14 deletions .github/workflows/regression.yml
@@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, datafusion, dask, clickhouse]
name: Regression Tests solo solutions
name: Solo solutions
runs-on: ubuntu-20.04
env:
CC: gcc-10
@@ -36,7 +36,7 @@ jobs:

- name: Install libraries
shell: bash
run: ./_utils/setup-small.sh
run: ./_setup_utils/setup_small.sh

- name: Generate 500mb datasets
shell: bash
@@ -50,7 +50,7 @@
shell: bash
run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }}

- name: Turn swap off
- name: Turn swap off
shell: bash
run: sudo swapoff -a

@@ -61,23 +61,32 @@ jobs:
shell: bash
if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }}
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse
python3 _setup_utils/prep_solutions.py --task=groupby --solution=clickhouse
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60
MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60

- name: Run mini GroupBy benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }}
python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }}
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60
MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60

- name: Run mini Join benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=join --solution=${{ matrix.solution }}
python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }}
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60
MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60

- name: Validate benchmark results and report generation
shell: bash
@@ -123,7 +132,7 @@ jobs:

- name: Install libraries
shell: bash
run: ./_utils/setup-small.sh
run: ./_setup_utils/setup_small.sh

- name: Generate 500mb datasets
shell: bash
@@ -144,16 +153,20 @@
- name: Run mini GroupBy benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=all
python3 _setup_utils/prep_solutions.py --task=groupby --solution=all
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60
MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

- name: Run mini Join benchmark
shell: bash
run: |
python3 _utils/prep_solutions.py --task=join --solution=all
python3 _setup_utils/prep_solutions.py --task=join --solution=all
source path.env
TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
sleep 60
MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh

- name: Validate benchmark results and report generation
shell: bash
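The workflow changes above replace each single `./run.sh` invocation with one run per machine type, exporting `MACHINE_TYPE` to the script and sleeping between runs. A minimal Python sketch of that pattern, with illustrative helper names that are not from the repo:

```python
import os

# The two machine types the updated workflow exercises.
MACHINE_TYPES = ["c6id.4xlarge", "c6id.metal"]

def benchmark_envs(workspace, machine_types=MACHINE_TYPES):
    """Build one run.sh environment per machine type, mirroring the
    MACHINE_TYPE=... TEST_RUN=true TEST_MOUNT_DIR=... invocations above."""
    envs = []
    for machine_type in machine_types:
        env = dict(os.environ)
        env.update(MACHINE_TYPE=machine_type,
                   TEST_RUN="true",
                   TEST_MOUNT_DIR=workspace)
        envs.append(env)
    return envs

# Each env would then be passed to subprocess.run(["./run.sh"], env=env),
# with a time.sleep(60) between runs so the previous solution (e.g. Spark's
# JVM) can shut down fully, as the "sleep 60" steps above suggest.
```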
2 changes: 2 additions & 0 deletions .gitignore
@@ -5,6 +5,8 @@ metastore_db/*
*.csv
!time.csv
!logs.csv
!_control/data_small.csv
!_control/data_large.csv
*.md5
.Rproj.user
.Rhistory
41 changes: 21 additions & 20 deletions R-arrow/groupby-R-arrow.R

Large diffs are not rendered by default.

21 changes: 11 additions & 10 deletions R-arrow/join-R-arrow.R
@@ -17,6 +17,7 @@ cache = TRUE
on_disk = FALSE

data_name = Sys.getenv("SRC_DATANAME")
machine_type = Sys.getenv("MACHINE_TYPE")
src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
y_data_name = join_to_tbls(data_name)
src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
@@ -46,15 +47,15 @@ t = system.time({
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
rm(ans)
t = system.time({
ans<-collect(inner_join(x, small, by="id1"))
print(dim(ans))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
ans <- collect(ans)
print(head(ans, 3))
print(tail(ans, 3))
@@ -68,15 +69,15 @@ t = system.time({
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
rm(ans)
t = system.time({
ans<-collect(inner_join(x, medium, by="id2"))
print(dim(ans))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
ans <- collect(ans)
print(head(ans, 3))
print(tail(ans, 3))
@@ -90,15 +91,15 @@ t = system.time({
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
rm(ans)
t = system.time({
ans<-collect(left_join(x, medium, by="id2"))
print(dim(ans))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
ans <- collect(ans)
print(head(ans, 3))
print(tail(ans, 3))
@@ -112,15 +113,15 @@ t = system.time({
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
rm(ans)
t = system.time({
ans <- collect(inner_join(x, medium, by="id5"))
print(dim(ans))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
ans <- collect(ans)
print(head(ans, 3))
print(tail(ans, 3))
@@ -134,15 +135,15 @@ t = system.time({
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
rm(ans)
t = system.time({
ans<-collect(inner_join(x, big, by="id3"))
print(dim(ans))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
ans <- collect(ans)
print(head(ans, 3))
print(tail(ans, 3))
26 changes: 19 additions & 7 deletions _benchplot/benchplot-dict.R
@@ -267,10 +267,12 @@ groupby.syntax.dict = {list(
)}
groupby.data.exceptions = {list( # exceptions as of run 1575727624
"collapse" = {list(
"Not Tested" = c("G1_1e9_1e2_0_0")
)},
"data.table" = {list(
"timeout" = c("G1_1e9_1e1_0_0", # not always happened, q8 probably #110
"G1_1e9_2e0_0_0") # q4 #110 also sometimes segfaults during fread but not easily reproducible
"G1_1e9_2e0_0_0"),
"Not Tested" = c("G1_1e9_1e2_0_0") # q4 #110 also sometimes segfaults during fread but not easily reproducible
)},
"dplyr" = {list(
"timeout" = c("G1_1e8_2e0_0_0"), # q10
@@ -285,7 +287,8 @@ groupby.data.exceptions = {list(
"csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0")
)},
"spark" = {list(
"timeout" = "G1_1e9_1e2_5_0" ## seems that both runs have finished but second run timing was not logged to time.csv due to timeout
"timeout" = "G1_1e9_1e2_5_0", ## seems that both runs have finished but second run timing was not logged to time.csv due to timeout
"Not Tested" = c("G1_1e9_1e2_0_0")
)},
"dask" = {list(
"not yet implemented: dask#6986" = c("G1_1e7_1e2_5_0","G1_1e8_1e2_5_0","G1_1e9_1e2_5_0"), # #171
@@ -307,9 +310,11 @@ groupby.data.exceptions = {list(
"CSV import Segfault: JuliaLang#55765" = c("G1_1e7_1e2_0_0","G1_1e7_1e1_0_0","G1_1e7_2e0_0_0","G1_1e7_1e2_0_1","G1_1e7_1e2_5_0","G1_1e8_1e2_0_0","G1_1e8_1e1_0_0","G1_1e8_2e0_0_0","G1_1e8_1e2_0_1","G1_1e8_1e2_5_0","G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0")
)},
"clickhouse" = {list(
"Out of Memory" = c("G1_1e9_1e2_0_0")
)},
"polars" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10
"Not Tested" = c("G1_1e9_1e2_0_0")
)},
"R-arrow" = {list(
# "timeout" = c(), # q10
@@ -325,7 +330,9 @@ groupby.data.exceptions = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0"),
# "incorrect: duckdb#1737" = c("G1_1e7_1e2_5_0","G1_1e8_1e2_5_0")
)},
"datafusion" = {list()}
"datafusion" = {list(
"Not Tested" = c("G1_1e9_1e2_0_0")
)}
)}
groupby.exceptions = task.exceptions(groupby.query.exceptions, groupby.data.exceptions)

@@ -463,6 +470,7 @@ join.query.exceptions = {list(
)}
join.data.exceptions = {list( # exceptions as of run 1575727624
"collapse" = {list(
"Not tested" = c("J1_1e9_NA_0_0")
)},
"data.table" = {list(
"timeout" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # fread
@@ -478,7 +486,8 @@ join.data.exceptions = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_0_1") # q5 out of memory due to a deep copy
)},
"spark" = {list(
"timeout" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # q5 using new 8h timeout #126
# "timeout" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1"), # q5 using new 8h timeout #126
"Not tested" = c("J1_1e9_NA_0_0")
)},
"dask" = {list(
"internal error: dask#7015" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1", # dask/dask#7015
@@ -494,6 +503,7 @@ join.data.exceptions = {list(
"CSV import Segfault: JuliaLang#55765" = c("J1_1e7_NA_0_0", "J1_1e7_NA_5_0", "J1_1e7_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1", "J1_1e9_NA_0_0")
)},
"clickhouse" = {list(
"Out of Memory" = c("J1_1e9_NA_0_0")
)},
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
@@ -504,15 +514,17 @@
)},
"duckdb" = {list(
# "internal error: duckdb#1739" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1"),
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
# "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
#"incorrect: duckdb#1737" = c("J1_1e7_NA_5_0","J1_1e8_NA_5_0")
)},
"duckdb-latest" = {list(
# "internal error: duckdb#1739" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1"),
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
# "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
#"incorrect: duckdb#1737" = c("J1_1e7_NA_5_0","J1_1e8_NA_5_0")
)},
"datafusion" = {list()}
"datafusion" = {list(
"Not tested" = c("J1_1e9_NA_0_0")
)}
)}
join.exceptions = task.exceptions(join.query.exceptions, join.data.exceptions)

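`_benchplot/benchplot-dict.R` keeps the exceptions as nested solution → reason → datasets lists, while the report needs them per (solution, dataset). A small Python sketch of that inversion, using a few entries transcribed from the `join.data.exceptions` diff; the real code is the R `task.exceptions()` helper, not shown here:

```python
# A few entries transcribed from the join.data.exceptions diff above.
join_data_exceptions = {
    "clickhouse": {"Out of Memory": ["J1_1e9_NA_0_0"]},
    "collapse":   {"Not tested":    ["J1_1e9_NA_0_0"]},
    "spark":      {"Not tested":    ["J1_1e9_NA_0_0"]},
    "datafusion": {"Not tested":    ["J1_1e9_NA_0_0"]},
}

def exceptions_by_dataset(per_solution):
    """Invert solution -> reason -> [datasets] into (solution, dataset) -> reason,
    the shape a report needs when annotating a single missing result."""
    flat = {}
    for solution, reasons in per_solution.items():
        for reason, datasets in reasons.items():
            for dataset in datasets:
                flat[(solution, dataset)] = reason
    return flat
```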
6 changes: 5 additions & 1 deletion _benchplot/benchplot.R
@@ -1,8 +1,12 @@
## Based on Matt Dowle scripts from 2014
## https://github.com/h2oai/db-benchmark/commit/fce1b8c9177afb49471fcf483a438f619f1a992b
## Original grouping benchmark can be found in: https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping
suppressPackageStartupMessages(library(bit64))

format_comma = function(x) {
format(as.integer64(x), big.mark=",")
}

format_comma = function(x) format(as.integer(x), big.mark=",")
format_num = function(x, digits=3L) { # at least 3+1 chars on output, there is surely some setting to achieve that better with base R but it is not obvious to find that among all features there
cx = sprintf("%0.2f", x)
int = sapply(strsplit(cx, ".", fixed=TRUE), `[`, 1L)
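The `format_comma` change swaps `as.integer()` for `bit64`'s `as.integer64()`, presumably because R's base integers are 32-bit: `as.integer()` returns `NA` for counts above 2^31 - 1 (about 2.1e9), which would break row-count labels for datasets beyond 1e9 rows. In Python, where integers are arbitrary precision, the same formatting needs no workaround:

```python
def format_comma(n):
    """Comma-group a row count, e.g. 1e9 -> '1,000,000,000'.

    Python ints never overflow, so no bit64-style widening is needed;
    int(n) only normalizes float inputs such as 1e9 to integers first.
    """
    return f"{int(n):,}"
```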
7 changes: 7 additions & 0 deletions _control/data_large.csv
@@ -0,0 +1,7 @@
task,data,nrow,k,na,sort,active
groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
join,J1_1e9_NA_0_0,1e9,NA,0,0,1
17 changes: 17 additions & 0 deletions _control/data_small.csv
@@ -0,0 +1,17 @@
task,data,nrow,k,na,sort,active
groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1
groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1
groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1
groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1
groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1
groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1
groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1
groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1
groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1
groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1
join,J1_1e7_NA_0_0,1e7,NA,0,0,1
join,J1_1e7_NA_5_0,1e7,NA,5,0,1
join,J1_1e7_NA_0_1,1e7,NA,0,1,1
join,J1_1e8_NA_0_0,1e8,NA,0,0,1
join,J1_1e8_NA_5_0,1e8,NA,5,0,1
join,J1_1e8_NA_0_1,1e8,NA,0,1,1
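The new `_control/data_small.csv` and `_control/data_large.csv` files let the runner pick datasets by machine type. A sketch of how such a control file might be consumed — the actual parsing lives in the repo's launcher scripts, not shown here:

```python
import csv
import io

def active_datasets(csv_text, task=None):
    """Return active dataset names from a control file, optionally
    filtered by task ('groupby' or 'join')."""
    rows = csv.DictReader(io.StringIO(csv_text))
    return [row["data"] for row in rows
            if row["active"] == "1" and (task is None or row["task"] == task)]

# First and last data lines of _control/data_large.csv from the diff above:
DATA_LARGE = """\
task,data,nrow,k,na,sort,active
groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
join,J1_1e9_NA_0_0,1e9,NA,0,0,1
"""
```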