From eb326b8ef66d0095c9139ea21c6aef93d3008285 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Thu, 28 Sep 2023 11:57:36 +0200 Subject: [PATCH 01/25] change update scripts to use slightly different versions of duckdb --- duckdb-latest/upg-duckdb-latest.sh | 19 +++++++++++++++++++ duckdb/upg-duckdb.sh | 20 ++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 duckdb-latest/upg-duckdb-latest.sh diff --git a/duckdb-latest/upg-duckdb-latest.sh b/duckdb-latest/upg-duckdb-latest.sh new file mode 100644 index 00000000..8ca0b126 --- /dev/null +++ b/duckdb-latest/upg-duckdb-latest.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# upgrade all packages in duckdb library only if new arrow is out +echo 'upgrading duckdb-latest, installing 0.9.1' + +rm -rf ./duckdb-latest/r-duckdb-latest +mkdir -p ./duckdb-latest/r-duckdb-latest + + +cd duckdb-latest +git clone https://github.com/duckdb/duckdb-r +cd duckdb-r +git checkout v0.9.0 +cd .. +ncores=$(nproc --all) +MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r +rm -rf duckdb-r +cd .. diff --git a/duckdb/upg-duckdb.sh b/duckdb/upg-duckdb.sh index e0b8a3eb..d1d59f45 100755 --- a/duckdb/upg-duckdb.sh +++ b/duckdb/upg-duckdb.sh @@ -2,5 +2,21 @@ set -e # upgrade all packages in duckdb library only if new arrow is out -echo 'upgrading duckdb...' -Rscript -e 'ap=available.packages(repos="https://cloud.r-project.org/"); if (ap["duckdb","Version"]!=packageVersion("duckdb", lib.loc="./duckdb/r-duckdb")) update.packages(lib.loc="./duckdb/r-duckdb", ask=FALSE, checkBuilt=TRUE, quiet=TRUE, repos="https://cloud.r-project.org/")' +echo 'upgrading duckdb, installing 0.8.1' + +rm -rf ./duckdb/r-duckdb +mkdir -p ./duckdb/r-duckdb + + +cd duckdb +git clone https://github.com/duckdb/duckdb-r +cd duckdb-r +git checkout v0.8.1 +cd .. +ncores=$(nproc --all) +MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r +rm -rf duckdb-r +cd .. + + +# Rscript -e 'ap=available.packages(repos="https://cloud.r-project.org/"); if (ap["duckdb","Version"]!=packageVersion("duckdb", lib.loc="./duckdb/r-duckdb")) update.packages(lib.loc="./duckdb/r-duckdb", ask=FALSE, checkBuilt=TRUE, quiet=TRUE, repos="https://cloud.r-project.org/")' From 10d590a7eb205a43289a0530a42065dd6f3a10e3 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:14:56 +0100 Subject: [PATCH 02/25] add clickhouse to regression script --- .github/workflows/regression.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 53a7684e..f4ecf306 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion] + solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion, clickhouse] name: Regression Tests solo solutions runs-on: ubuntu-20.04 env: From f96d042b32045e6607a39cc7cc144b7cb34eeae7 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:24:01 +0100 Subject: [PATCH 03/25] actually install clickhouse for regression test --- _utils/install_all_solutions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/_utils/install_all_solutions.py b/_utils/install_all_solutions.py index 58bd847b..f16aff95 100755 --- a/_utils/install_all_solutions.py +++ b/_utils/install_all_solutions.py @@ -26,8 +26,6 @@ def install_all_solutions(): with open(SOLUTIONS_FILENAME, newline="") as solutions_file: solutions = csv.DictReader(solutions_file, delimiter=',') for row in solutions: - if row['solution'] == "clickhouse": - continue elif row['solution'] == "data.table": install_solutions.add("datatable") else: From a2a9fa0d4fed71c38d5512e33d2d37751e226e5b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:25:09 +0100 Subject: [PATCH 04/25] remove clickhouse from skipped solutions --- _utils/prep_solutions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_utils/prep_solutions.py b/_utils/prep_solutions.py index 98f4ddfc..41637d62 100755 --- a/_utils/prep_solutions.py +++ b/_utils/prep_solutions.py @@ -5,7 +5,7 @@ SOLUTIONS_FILENAME = "_control/solutions.csv" RUN_CONF_FILENAME = "run.conf" -SKIPPED_SOLUTIONS = ["clickhouse"] +SKIPPED_SOLUTIONS = [] def print_usage(): From 56d659be4b83822099c53a9f31e15afe6bae7ea7 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:25:57 +0100 Subject: [PATCH 05/25] add tmate solution --- .github/workflows/regression.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index f4ecf306..1090e86b 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,6 +54,9 @@ jobs: shell: bash run: sudo swapoff -a + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + - name: Run mini GroupBy benchmark shell: bash run: | From 6c69d059ddca82de920cb964b085d7a89f7d545b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:31:13 +0100 Subject: [PATCH 06/25] fix syntax mistake --- _utils/install_all_solutions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_utils/install_all_solutions.py b/_utils/install_all_solutions.py index f16aff95..c540e4ff 100755 --- a/_utils/install_all_solutions.py +++ b/_utils/install_all_solutions.py @@ -26,7 +26,7 @@ def install_all_solutions(): with open(SOLUTIONS_FILENAME, newline="") as solutions_file: solutions = csv.DictReader(solutions_file, delimiter=',') for row in solutions: - elif row['solution'] == "data.table": + if row['solution'] == "data.table": install_solutions.add("datatable") else: install_solutions.add(row['solution']) From 36593ce885b78df8b52eb88410a882a92daa5615 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 16:45:30 +0100 Subject: [PATCH 07/25] remove clickhouse skip again --- _utils/install_all_solutions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/_utils/install_all_solutions.py b/_utils/install_all_solutions.py index c540e4ff..ec2dfd40 100755 --- a/_utils/install_all_solutions.py +++ b/_utils/install_all_solutions.py @@ -42,9 +42,7 @@ def install_all_solutions(): if solution.strip() == "all": install_all_solutions() else: - if solution == "clickhouse": - continue - elif solution == "data.table": + if solution == "data.table": install_solution("datatable") else: install_solution(solution) From 0e48ec1c138c81d49b0bd215062e35e2b9cc9f4f Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 17:01:51 +0100 Subject: [PATCH 08/25] remove tmate session --- .github/workflows/regression.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 1090e86b..f4ecf306 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,9 +54,6 @@ jobs: shell: bash run: sudo swapoff -a - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - - name: Run mini GroupBy benchmark shell: bash run: | From 5c9af39d7fb0d8802bcad9c82ce1c5cea8576542 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 17:09:31 +0100 Subject: [PATCH 09/25] also stop clickhouse --- clickhouse/setup-clickhouse.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clickhouse/setup-clickhouse.sh b/clickhouse/setup-clickhouse.sh index 4c6e87cc..8d442835 100755 --- a/clickhouse/setup-clickhouse.sh +++ b/clickhouse/setup-clickhouse.sh @@ -14,7 +14,7 @@ sudo rm /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse sudo service clickhouse-server start # stop server -#sudo service clickhouse-server stop +sudo service clickhouse-server stop # let file table function access csv -- NO LONGER NECESSARY # grep '/var/lib/clickhouse/user_files/' /etc/clickhouse-server/config.xml From 8ef6e508db349825a075f9ba893c741b75989238 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 30 Nov 2023 19:05:06 +0100 Subject: [PATCH 10/25] only check if not test run --- clickhouse/clickhouseOutput.txt | 51 +++++++++++++++++++++++++++++++++ clickhouse/exec.sh | 20 +++++++++---- 2 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 clickhouse/clickhouseOutput.txt diff --git a/clickhouse/clickhouseOutput.txt b/clickhouse/clickhouseOutput.txt new file mode 100644 index 00000000..03780bfe --- /dev/null +++ b/clickhouse/clickhouseOutput.txt @@ -0,0 +1,51 @@ + timeout_s compare run_batch + 1: 3600 NA + 2: 3600 NA + 3: 3600 NA + 4: 3600 NA + 5: 3600 NA + 6: 7200 NA + 7: 7200 NA + 8: 7200 NA + 9: 7200 NA +10: 7200 NA +11: 10800 NA +12: 10800 NA +13: 10800 NA +14: 10800 NA +15: 10800 NA +start: clickhouse groupby G1_1e7_1e2_0_0 +finish: clickhouse groupby G1_1e7_1e2_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e7_1e1_0_0 +finish: clickhouse groupby G1_1e7_1e1_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e7_2e0_0_0 +finish: clickhouse groupby G1_1e7_2e0_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e7_1e2_0_1 +finish: clickhouse groupby G1_1e7_1e2_0_1: 0: stderr 86 +start: clickhouse groupby G1_1e7_1e2_5_0 +finish: clickhouse groupby G1_1e7_1e2_5_0: 0: stderr 86 +start: clickhouse groupby G1_1e8_1e2_0_0 +finish: clickhouse groupby G1_1e8_1e2_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e8_1e1_0_0 +finish: clickhouse groupby G1_1e8_1e1_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e8_2e0_0_0 +finish: clickhouse groupby G1_1e8_2e0_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e8_1e2_0_1 +finish: clickhouse groupby G1_1e8_1e2_0_1: 0: stderr 86 +start: clickhouse groupby G1_1e8_1e2_5_0 +finish: clickhouse groupby G1_1e8_1e2_5_0: 0: stderr 86 +start: clickhouse groupby G1_1e9_1e2_0_0 +finish: clickhouse groupby G1_1e9_1e2_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e9_1e1_0_0 +finish: clickhouse groupby G1_1e9_1e1_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e9_2e0_0_0 +finish: clickhouse groupby G1_1e9_2e0_0_0: 0: stderr 86 +start: clickhouse groupby G1_1e9_1e2_0_1 +finish: clickhouse groupby G1_1e9_1e2_0_1: 0: stderr 86 +start: clickhouse groupby G1_1e9_1e2_5_0 +finish: clickhouse groupby G1_1e9_1e2_5_0: 0: stderr 86 +# Rendering report +# Benchmark index report produced +# Benchmark history report produced +# Benchmark tech report produced +# Benchmark run 1701338941 has been completed in 4030s \ No newline at end of file diff --git a/clickhouse/exec.sh b/clickhouse/exec.sh index 1b697c12..4d7c799c 100755 --- a/clickhouse/exec.sh +++ b/clickhouse/exec.sh @@ -34,29 +34,39 @@ if [ $1 == 'groupby' ]; then clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();" tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)') FORMAT CSV" # confirm all data loaded yandex/ClickHouse#4463 - echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + if [ ! $TEST_RUN ]; then + echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + fi elif [ $1 == 'join' ]; then # lhs clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME" clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();" tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)') FORMAT CSV" - echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + if [ ! $TEST_RUN ]; then + echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + fi RHS=$(join_to_tbls $SRC_DATANAME) RHS1=$(echo $RHS | cut -d' ' -f1) clickhouse-client --query "DROP TABLE IF EXISTS $RHS1" clickhouse-client --query "CREATE TABLE $RHS1 (id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();" tail -n+2 data/$RHS1.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS1 SELECT * FROM input('id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)') FORMAT CSV" - echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + if [ ! $TEST_RUN ]; then + echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + fi RHS2=$(echo $RHS | cut -d' ' -f2) clickhouse-client --query "DROP TABLE IF EXISTS $RHS2" clickhouse-client --query "CREATE TABLE $RHS2 (id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();" tail -n+2 data/$RHS2.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS2 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)') FORMAT CSV" - echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + if [ ! $TEST_RUN ]; then + echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + fi RHS3=$(echo $RHS | cut -d' ' -f3) clickhouse-client --query "DROP TABLE IF EXISTS $RHS3" clickhouse-client --query "CREATE TABLE $RHS3 (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();" tail -n+2 data/$RHS3.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS3 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)') FORMAT CSV" - echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + if [ ! $TEST_RUN ]; then + echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)' + fi else echo "clickhouse task $1 not implemented" >&2 && exit 1 fi From bb6ab4591fa71c16e10a1b04008570d0d86750d3 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 13:02:20 +0100 Subject: [PATCH 11/25] add tmate session back. Check for error and exception --- .github/workflows/regression.yml | 3 ++ _utils/validate_no_errors.sh | 4 +-- clickhouse/clickhouseOutput.txt | 51 -------------------------------- 3 files changed, 5 insertions(+), 53 deletions(-) delete mode 100644 clickhouse/clickhouseOutput.txt diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index f4ecf306..1090e86b 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,6 +54,9 @@ jobs: shell: bash run: sudo swapoff -a + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + - name: Run mini GroupBy benchmark shell: bash run: | diff --git a/_utils/validate_no_errors.sh b/_utils/validate_no_errors.sh index cd855369..4ee74097 100755 --- a/_utils/validate_no_errors.sh +++ b/_utils/validate_no_errors.sh @@ -1,10 +1,10 @@ -if [ $(grep -i "error" out/run_*.err | wc -l) = 0 ] +if [ $(grep -i "[error|exception]" out/run_*.err | wc -l) = 0 ] then # no true errors found, print last line of each output script echo "No Errors found in run_*.err logs" else echo "The following errors have been found. Failing check" - grep -i "error" out/*.err + grep -i "[error|exception]" out/*.err exit 1 fi diff --git a/clickhouse/clickhouseOutput.txt b/clickhouse/clickhouseOutput.txt deleted file mode 100644 index 03780bfe..00000000 --- a/clickhouse/clickhouseOutput.txt +++ /dev/null @@ -1,51 +0,0 @@ - timeout_s compare run_batch - 1: 3600 NA - 2: 3600 NA - 3: 3600 NA - 4: 3600 NA - 5: 3600 NA - 6: 7200 NA - 7: 7200 NA - 8: 7200 NA - 9: 7200 NA -10: 7200 NA -11: 10800 NA -12: 10800 NA -13: 10800 NA -14: 10800 NA -15: 10800 NA -start: clickhouse groupby G1_1e7_1e2_0_0 -finish: clickhouse groupby G1_1e7_1e2_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e7_1e1_0_0 -finish: clickhouse groupby G1_1e7_1e1_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e7_2e0_0_0 -finish: clickhouse groupby G1_1e7_2e0_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e7_1e2_0_1 -finish: clickhouse groupby G1_1e7_1e2_0_1: 0: stderr 86 -start: clickhouse groupby G1_1e7_1e2_5_0 -finish: clickhouse groupby G1_1e7_1e2_5_0: 0: stderr 86 -start: clickhouse groupby G1_1e8_1e2_0_0 -finish: clickhouse groupby G1_1e8_1e2_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e8_1e1_0_0 -finish: clickhouse groupby G1_1e8_1e1_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e8_2e0_0_0 -finish: clickhouse groupby G1_1e8_2e0_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e8_1e2_0_1 -finish: clickhouse groupby G1_1e8_1e2_0_1: 0: stderr 86 -start: clickhouse groupby G1_1e8_1e2_5_0 -finish: clickhouse groupby G1_1e8_1e2_5_0: 0: stderr 86 -start: clickhouse groupby G1_1e9_1e2_0_0 -finish: clickhouse groupby G1_1e9_1e2_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e9_1e1_0_0 -finish: clickhouse groupby G1_1e9_1e1_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e9_2e0_0_0 -finish: clickhouse groupby G1_1e9_2e0_0_0: 0: stderr 86 -start: clickhouse groupby G1_1e9_1e2_0_1 -finish: clickhouse groupby G1_1e9_1e2_0_1: 0: stderr 86 -start: clickhouse groupby G1_1e9_1e2_5_0 -finish: clickhouse groupby G1_1e9_1e2_5_0: 0: stderr 86 -# Rendering report -# Benchmark index report produced -# Benchmark history report produced -# Benchmark tech report produced -# Benchmark run 1701338941 has been completed in 4030s \ No newline at end of file From 0e08e858aef7f5389b73d9a35f1f0874044515a7 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 13:49:42 +0100 Subject: [PATCH 12/25] run polars and clickhouse. solution verify needs both results to work --- _utils/install_all_solutions.py | 3 +++ _utils/prep_solutions.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/_utils/install_all_solutions.py b/_utils/install_all_solutions.py index ec2dfd40..56c4065d 100755 --- a/_utils/install_all_solutions.py +++ b/_utils/install_all_solutions.py @@ -44,6 +44,9 @@ def install_all_solutions(): else: if solution == "data.table": install_solution("datatable") + if solution == "clickhouse": + install_solution("clickhouse") + install_solution("polars") else: install_solution(solution) diff --git a/_utils/prep_solutions.py b/_utils/prep_solutions.py index 41637d62..ed3e4a24 100755 --- a/_utils/prep_solutions.py +++ b/_utils/prep_solutions.py @@ -33,6 +33,8 @@ def main(): solution = parse_solution() if solution == "all": solution = get_solutions(task) + if solution == "clickhouse": + solution = "clickhouse polars" update_run_conf_solutions(solution, task) def update_run_conf_solutions(solution_name_list, task): From ab7eed77738ab2d68732c92a17e0fa9c7c799b7d Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 14:24:49 +0100 Subject: [PATCH 13/25] run a benchmark twice --- .github/workflows/regression.yml | 3 --- _utils/generate-data-small.sh | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 1090e86b..f4ecf306 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,9 +54,6 @@ jobs: shell: bash run: sudo swapoff -a - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - - name: Run mini GroupBy benchmark shell: bash run: | diff --git a/_utils/generate-data-small.sh b/_utils/generate-data-small.sh index 44774469..a23d83ce 100755 --- a/_utils/generate-data-small.sh +++ b/_utils/generate-data-small.sh @@ -22,6 +22,9 @@ mv _control/data.csv _control/data.csv.original echo "task,data,nrow,k,na,sort,active" > _control/data.csv echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv +# put this twice. clickhouse (for some reason), errors on the first attempt to run the benchmark +# running this dataset twice will overwrite the old error file. +echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv echo "groupby,G1_1e7_1e2_15_0,1e7,1e2,15,0,1" >> _control/data.csv echo "groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1" >> _control/data.csv echo "join,J1_1e7_NA_0_0,1e7,NA,0,0,1" >> _control/data.csv From ccd03e64dfab6c40af2d66fdf3ee92d73728971d Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 14:55:06 +0100 Subject: [PATCH 14/25] if clickhouse run an initial group by --- .github/workflows/regression.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index f4ecf306..8678331a 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,6 +54,14 @@ jobs: shell: bash run: sudo swapoff -a + - name: Run mini GroupBy benchmark if clickhouse + shell: bash + if: ${{ matrix.solution }} == "clickhouse" || ${{ solution }} == "all" + run: | + python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse + source path.env + TEST_RUN=true ./run.sh + - name: Run mini GroupBy benchmark shell: bash run: | From bd7281e2596d1964aa455a4682feca4ee37ef762 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 14:58:51 +0100 Subject: [PATCH 15/25] matrix.solution --- .github/workflows/regression.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 8678331a..590701cf 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -56,7 +56,7 @@ jobs: - name: Run mini GroupBy benchmark if clickhouse shell: bash - if: ${{ matrix.solution }} == "clickhouse" || ${{ solution }} == "all" + if: ${{ matrix.solution }} == "clickhouse" || ${{ matrix.solution }} == "all" run: | python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse source path.env From c7422aeaf44ec26bd8c9549498fac0939fa7acd8 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 15:13:25 +0100 Subject: [PATCH 16/25] remove duplicate test cases --- _utils/generate-data-small.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/_utils/generate-data-small.sh b/_utils/generate-data-small.sh index a23d83ce..44774469 100755 --- a/_utils/generate-data-small.sh +++ b/_utils/generate-data-small.sh @@ -22,9 +22,6 @@ mv _control/data.csv _control/data.csv.original echo "task,data,nrow,k,na,sort,active" > _control/data.csv echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv -# put this twice. clickhouse (for some reason), errors on the first attempt to run the benchmark -# running this dataset twice will overwrite the old error file. -echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv echo "groupby,G1_1e7_1e2_15_0,1e7,1e2,15,0,1" >> _control/data.csv echo "groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1" >> _control/data.csv echo "join,J1_1e7_NA_0_0,1e7,NA,0,0,1" >> _control/data.csv From ec99cb306647656fa54c83310835d3f9b8c3c868 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 15:21:18 +0100 Subject: [PATCH 17/25] fix if statement --- .github/workflows/regression.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 590701cf..3b0edece 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -56,7 +56,7 @@ jobs: - name: Run mini GroupBy benchmark if clickhouse shell: bash - if: ${{ matrix.solution }} == "clickhouse" || ${{ matrix.solution }} == "all" + if: ${{ matrix.solution == "clickhouse" || matrix.solution == "all" }} run: | python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse source path.env From 0c933868651ce9aba343b41189831dfa940312ea Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 15:38:32 +0100 Subject: [PATCH 18/25] double quote to singe --- .github/workflows/regression.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 3b0edece..5a2b9eed 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -56,7 +56,7 @@ jobs: - name: Run mini GroupBy benchmark if clickhouse shell: bash - if: ${{ matrix.solution == "clickhouse" || matrix.solution == "all" }} + if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }} run: | python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse source path.env From e071dec6b5df84ae17b629bec25d1283bf7084a7 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 15:45:45 +0100 Subject: [PATCH 19/25] fix datatable install --- _utils/install_all_solutions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_utils/install_all_solutions.py b/_utils/install_all_solutions.py index 56c4065d..c000e521 100755 --- a/_utils/install_all_solutions.py +++ b/_utils/install_all_solutions.py @@ -44,7 +44,7 @@ def install_all_solutions(): else: if solution == "data.table": install_solution("datatable") - if solution == "clickhouse": + elif solution == "clickhouse": install_solution("clickhouse") install_solution("polars") else: From e1f00ebe379bd89832e5cd7fa40c5a3e50f0f85a Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Fri, 1 Dec 2023 16:15:31 +0100 Subject: [PATCH 20/25] fix regex --- _utils/validate_no_errors.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_utils/validate_no_errors.sh b/_utils/validate_no_errors.sh index 4ee74097..9c8fa4ef 100755 --- a/_utils/validate_no_errors.sh +++ b/_utils/validate_no_errors.sh @@ -1,10 +1,10 @@ -if [ $(grep -i "[error|exception]" out/run_*.err | wc -l) = 0 ] +if [ $(grep -i 'error|exception' out/run_*.err | wc -l) = 0 ] then # no true errors found, print last line of each output script echo "No Errors found in run_*.err logs" else echo "The following errors have been found. Failing check" - grep -i "[error|exception]" out/*.err + grep -i "error|exception" out/*.err exit 1 fi From bd1beb2a21c01aecc04202c3e82f9c97661b7787 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Mon, 4 Dec 2023 10:22:24 +0100 Subject: [PATCH 21/25] add comment explaining why we run clickhouse twice --- .github/workflows/regression.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 5a2b9eed..b17ee50f 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -54,6 +54,10 @@ jobs: shell: bash run: sudo swapoff -a + + # needed because clickhouse for some reason produces an error the first + # time a benchmark is run. The next benchmark run will work and overwrite the + # old benchmark files. - name: Run mini GroupBy benchmark if clickhouse shell: bash if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }} From c84afb2ea6d5787ad9fc18dc2ddb39c997828b9c Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Wed, 6 Dec 2023 09:38:20 +0100 Subject: [PATCH 22/25] Rename R to R-arrow (#68) * should change arrow to show R-arrow * new arrow benchmarks report solution as R-arrow * update arrow to R-arrow in a few more places * Fix remaining issues in https://github.com/Tmonster/db-benchmark/pull/10 (#13) * Fix remaining issues in arrow -> R-arrow rename * Fix bug in rename code in report.R The previous code was causing something wild to happen. The changed code is idiomatic code for replacing values in a data.frame based on a condition. --------- Co-authored-by: Bryce Mecum --- .github/workflows/regression.yml | 2 +- arrow/groupby-arrow.R => R-arrow/groupby-R-arrow.R | 6 +++--- arrow/join-arrow.R => R-arrow/join-R-arrow.R | 6 +++--- R-arrow/setup-R-arrow.sh | 6 ++++++ arrow/upg-arrow.sh => R-arrow/upg-R-arrow.sh | 2 +- R-arrow/ver-R-arrow.sh | 4 ++++ _benchplot/benchplot-dict.R | 14 +++++++------- _control/solutions.csv | 4 ++-- _launcher/launcher.R | 2 +- _launcher/solution.R | 2 +- _report/report.R | 13 ++++++++++--- arrow/setup-arrow.sh | 6 ------ arrow/ver-arrow.sh | 4 ---- run.conf | 2 +- run.sh | 4 ++-- 15 files changed, 42 insertions(+), 35 deletions(-) rename arrow/groupby-arrow.R => R-arrow/groupby-R-arrow.R (98%) rename arrow/join-arrow.R => R-arrow/join-R-arrow.R (97%) create mode 100755 R-arrow/setup-R-arrow.sh rename arrow/upg-arrow.sh => R-arrow/upg-R-arrow.sh (55%) create mode 100755 R-arrow/ver-R-arrow.sh delete mode 100755 arrow/setup-arrow.sh delete mode 100755 arrow/ver-arrow.sh diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 53a7684e..12a955cf 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion] + solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion] name: Regression Tests solo solutions runs-on: ubuntu-20.04 env: diff --git a/arrow/groupby-arrow.R b/R-arrow/groupby-R-arrow.R similarity index 98% rename from arrow/groupby-arrow.R rename to R-arrow/groupby-R-arrow.R index 950bcff0..100d3dec 100755 --- a/arrow/groupby-arrow.R +++ b/R-arrow/groupby-R-arrow.R @@ -7,13 +7,13 @@ source("./_helpers/helpers.R") stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns .libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well suppressPackageStartupMessages({ - library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) - library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) + library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) + library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) }) ver = packageVersion("arrow") git = "" task = "groupby" -solution = "arrow" +solution = "R-arrow" fun = "group_by" cache = TRUE on_disk = FALSE diff --git a/arrow/join-arrow.R b/R-arrow/join-R-arrow.R similarity index 97% rename from arrow/join-arrow.R rename to R-arrow/join-R-arrow.R index 69df274d..559d05c9 100755 --- a/arrow/join-arrow.R +++ b/R-arrow/join-R-arrow.R @@ -6,13 +6,13 @@ source("./_helpers/helpers.R") .libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well suppressPackageStartupMessages({ - library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) - library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE) + library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) + library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE) }) ver = packageVersion("arrow") git = "" task = "join" -solution = "arrow" +solution = "R-arrow" cache = TRUE on_disk = FALSE diff --git a/R-arrow/setup-R-arrow.sh b/R-arrow/setup-R-arrow.sh new file mode 100755 index 00000000..e5ff947a --- /dev/null +++ b/R-arrow/setup-R-arrow.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# install stable arrow +mkdir -p ./R-arrow/r-arrow +Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")' diff --git a/arrow/upg-arrow.sh b/R-arrow/upg-R-arrow.sh similarity index 55% rename from arrow/upg-arrow.sh rename to R-arrow/upg-R-arrow.sh index d2fb9de5..4d677d3e 100755 --- a/arrow/upg-arrow.sh +++ b/R-arrow/upg-R-arrow.sh @@ -3,4 +3,4 @@ set -e # upgrade all packages in arrow library only if new arrow is out echo 'upgrading arrow...' -Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' +Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' diff --git a/R-arrow/ver-R-arrow.sh b/R-arrow/ver-R-arrow.sh new file mode 100755 index 00000000..8c24e043 --- /dev/null +++ b/R-arrow/ver-R-arrow.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index 6ac2df8a..63a80206 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -42,7 +42,7 @@ solution.dict = {list( "juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")), "clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")), "polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")), - "arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")), + "R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")), "duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")), "duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")), "datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3")) @@ -199,7 +199,7 @@ groupby.syntax.dict = {list( "regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()", "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()" )}, - "arrow" = {c( + "R-arrow" = {c( "sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))", "sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))", "sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))", @@ -260,7 +260,7 @@ groupby.syntax.dict = {list( "juliads" = list(), "clickhouse" = list(), "polars" = list(), - "arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"), + "R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"), "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list() @@ -309,7 +309,7 @@ groupby.data.exceptions = {list( "polars" = {list( # "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10 )}, - "arrow" = {list( + "R-arrow" = {list( # "timeout" = c(), # q10 "internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr "G1_1e9_2e0_0_0"), # #190 @@ -413,7 +413,7 @@ join.syntax.dict = {list( "medium inner on factor" = "DF.merge(medium, on='id5')", "big inner on int" = "DF.merge(big, on='id3')" )}, - "arrow" = {c( + "R-arrow" = {c( "small inner on int" = "inner_join(DF, small, by='id1')", "medium inner on int" = "inner_join(DF, medium, by='id2')", "medium outer on int" = "left_join(DF, medium, by='id2')", @@ -454,7 +454,7 @@ join.query.exceptions = {list( "juliads" = list(), "clickhouse" = list(), "polars" = list(), - "arrow" = list(), + "R-arrow" = list(), "duckdb" = list(), "duckdb-latest" = list(), "datafusion" = list() @@ -496,7 +496,7 @@ join.data.exceptions = {list( "polars" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") )}, - "arrow" = {list( + "R-arrow" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#, # "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") )}, diff --git a/_control/solutions.csv b/_control/solutions.csv index c96f07cf..89009a06 100644 --- a/_control/solutions.csv +++ b/_control/solutions.csv @@ -25,8 +25,8 @@ clickhouse,groupby clickhouse,join polars,groupby polars,join -arrow,groupby -arrow,join +R-arrow,groupby +R-arrow,join duckdb,groupby duckdb,join duckdb-latest,groupby diff --git a/_launcher/launcher.R b/_launcher/launcher.R index 167d9dee..c2d59bdb 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -14,7 +14,7 @@ readret = function(x) { file.ext = function(x) { ans = switch( x, - "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", + "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", diff --git a/_launcher/solution.R b/_launcher/solution.R index f66b4311..35d3a6a2 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) { file.ext = function(x) { ans = switch( x, - "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", + "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl" diff --git a/_report/report.R b/_report/report.R index 35082113..29405a0d 100644 --- a/_report/report.R +++ b/_report/report.R @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) { file.path(path, "report-done") } get_report_solutions = function() { - c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion") + c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow") } get_data_levels = function() { ## groupby @@ -69,6 +69,9 @@ clean_time = function(d) { if (nrow(d[!nzchar(version) | is.na(version)])) stop("timings data contains NA or '' as version field, that should not happen") old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6") + + # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) + d[which(solution == "arrow"),c("solution")] == "R-arrow" d[!nzchar(git), git := NA_character_ ][,"on_disk" := as.logical(on_disk) ][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_ @@ -243,9 +246,13 @@ transform = function(ld) { # all ---- time_logs = function(path=getwd()) { - ct = clean_time(load_time(path=getwd())) + lt <- load_time(path=getwd()) + + ct = clean_time(lt) d = model_time(ct) - l = model_logs(clean_logs(load_logs(path=path))) + ll <- load_logs(path=path) + ll$solution[ll$solution == "arrow"] <- "R-arrow" + l = model_logs(clean_logs(ll)) q = model_questions(clean_questions(load_questions(path=path))) lq = merge_logs_questions(l, q) diff --git a/arrow/setup-arrow.sh b/arrow/setup-arrow.sh deleted file mode 100755 index dcad2ad3..00000000 --- a/arrow/setup-arrow.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -# install stable arrow -mkdir -p ./arrow/r-arrow -Rscript -e 'install.packages(c("arrow","dplyr"), lib="./arrow/r-arrow")' diff --git a/arrow/ver-arrow.sh b/arrow/ver-arrow.sh deleted file mode 100755 index 44bb8ede..00000000 --- a/arrow/ver-arrow.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -e - -Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' diff --git a/run.conf b/run.conf index 14e0f435..c019b15f 100644 --- a/run.conf +++ b/run.conf @@ -1,7 +1,7 @@ # task, used in init-setup-iteration.R export RUN_TASKS="groupby join" # solution, used in init-setup-iteration.R -export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb duckdb-latest datafusion" +export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion" # flag to upgrade tools, used in run.sh on init export DO_UPGRADE=false diff --git a/run.sh b/run.sh index 8afc679c..e834a09e 100755 --- a/run.sh +++ b/run.sh @@ -71,8 +71,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/upg-h2o. if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi; if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi; -if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/upg-arrow.sh; fi; -if [[ "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/ver-arrow.sh; fi; +if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/R-upg-arrow.sh; fi; +if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/ver-R-arrow.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/upg-duckdb.sh; fi; if [[ "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/ver-duckdb.sh; fi; if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/setup-duckdb-latest.sh; fi; From 76bc7700448603e63eee4b0365c788e4a3e3c44b Mon Sep 17 00:00:00 2001 From: Miles Date: Wed, 6 Dec 2023 12:38:50 +0100 Subject: [PATCH 23/25] Dask: Refactor and improve groupby-dask (#64) * Dask: Enable Q7 to Q10 again * add dask to workflows file * bump python version * Refactor - move things to functions and move groupby-dask2.py into groupby-dask.py * Remove repetitive code into benchmark decorator * Use LocalCluster context for better cleanup * Don't explicitly set dtypes, pyarrow can figure it out * Comment out query 8, missing API * Revert refactoring, just remove explicit dtype * Fix q9 error previously fixed in refactor * Update join reading to use pyarrow and inference --------- Co-authored-by: fjetter --- .github/workflows/regression.yml | 2 +- dask/VERSION | 2 +- dask/groupby-dask2.py | 300 ++++++++++++++++--------------- dask/join-dask.py | 8 +- dask/setup-dask.sh | 4 +- 5 files changed, 165 insertions(+), 151 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 12a955cf..7cf065f7 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion] + solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion, dask] name: Regression Tests solo solutions runs-on: ubuntu-20.04 env: diff --git a/dask/VERSION b/dask/VERSION index ea516b56..3bae6081 100644 --- a/dask/VERSION +++ b/dask/VERSION @@ -1 +1 @@ -2023.10.1 \ No newline at end of file +2023.10.0 \ No newline at end of file diff --git a/dask/groupby-dask2.py b/dask/groupby-dask2.py index 41c0f231..52cb0e34 100755 --- a/dask/groupby-dask2.py +++ b/dask/groupby-dask2.py @@ -23,8 +23,6 @@ from dask import distributed # we use process-pool instead of thread-pool due to GIL cost client = distributed.Client(processes=True, silence_logs=logging.ERROR) -# since we are running on local cluster of processes, we would prefer to keep the communication between workers to relative minimum, thus it's better to trade some tasks granularity for better processing locality -dk.config.set({"optimization.fuse.ave-width": 20}) data_name = os.environ['SRC_DATANAME'] on_disk = False #data_name.split("_")[1] == "1e9" # on-disk data storage #126 @@ -38,9 +36,8 @@ exit(0) # not yet implemented #171, currently groupby's dropna=False argument is ignored print("using disk memory-mapped data storage" if on_disk else "using in-memory data storage", flush=True) -#x = dd.read_parquet(src_grp, engine="fastparquet") if on_disk else -x = dd.read_csv(src_grp, dtype={"id1":"category","id2":"category","id3":"category","id4":"Int32","id5":"Int32","id6":"Int32","v1":"Int32","v2":"Int32","v3":"float64"}) - +#x = dd.read_parquet(src_grp, engine="pyarrow") if on_disk else +x = dd.read_csv(src_grp, engine="pyarrow") x = x.persist() in_rows = len(x) @@ -189,147 +186,166 @@ print(ans.tail(3), flush=True) del ans -#question = "median v3 sd v3 by id4 id5" # q6 # median function not yet implemented: https://github.com/dask/dask/issues/4362 -#gc.collect() -#t_start = timeit.default_timer() -#ans = x.groupby(['id4','id5'], dropna=False, observed=True).agg({'v3': ['median','std']}).compute() -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#del ans -#gc.collect() -#t_start = timeit.default_timer() -#ans = x.groupby(['id4','id5'], dropna=False, observed=True).agg({'v3': ['median','std']}).compute() -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(ans.head(3), flush=True) -#print(ans.tail(3), flush=True) -#del ans +question = "median v3 sd v3 by id4 id5" # q6 +gc.collect() +t_start = timeit.default_timer() +ans = x.groupby(['id4','id5'], dropna=False, observed=True).agg({'v3': ['median','std']}, shuffle='p2p').compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.groupby(['id4','id5'], dropna=False, observed=True).agg({'v3': ['median','std']}, shuffle='p2p').compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans -# question = "max v1 - min v2 by id3" # q7 -# gc.collect() -# t_start = timeit.default_timer() -# ans = x.groupby('id3', dropna=False, observed=True).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['range_v1_v2']].compute() -# ans.reset_index(inplace=True) -# print(ans.shape, flush=True) -# t = timeit.default_timer() - t_start -# m = memory_usage() -# t_start = timeit.default_timer() -# chk = [ans['range_v1_v2'].sum()] -# chkt = timeit.default_timer() - t_start -# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -# del ans -# gc.collect() -# t_start = timeit.default_timer() -# ans = x.groupby('id3', dropna=False, observed=True).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['range_v1_v2']].compute() -# ans.reset_index(inplace=True) -# print(ans.shape, flush=True) -# t = timeit.default_timer() - t_start -# m = memory_usage() -# t_start = timeit.default_timer() -# chk = [ans['range_v1_v2'].sum()] -# chkt = timeit.default_timer() - t_start -# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -# print(ans.head(3), flush=True) -# print(ans.tail(3), flush=True) -# del ans +question = "max v1 - min v2 by id3" # q7 +gc.collect() +t_start = timeit.default_timer() +ans = x.groupby('id3', dropna=False, observed=True).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['range_v1_v2']].compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['range_v1_v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.groupby('id3', dropna=False, observed=True).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['range_v1_v2']].compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['range_v1_v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans -#question = "largest two v3 by id6" # q8 -#gc.collect() -#t_start = timeit.default_timer() -#ans = x[~x['v3'].isna()][['id6','v3']].groupby('id6', dropna=False, observed=True).apply(lambda x: x.nlargest(2, columns='v3'), meta={'id6':'Int64', 'v3':'float64'})[['v3']].compute() -#ans.reset_index(level='id6', inplace=True) -#ans.reset_index(drop=True, inplace=True) # drop because nlargest creates some extra new index field -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['v3'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#del ans -#gc.collect() -#t_start = timeit.default_timer() -#ans = x[~x['v3'].isna()][['id6','v3']].groupby('id6', dropna=False, observed=True).apply(lambda x: x.nlargest(2, columns='v3'), meta={'id6':'Int64', 'v3':'float64'})[['v3']].compute() -#ans.reset_index(level='id6', inplace=True) -#ans.reset_index(drop=True, inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['v3'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(ans.head(3), flush=True) -#print(ans.tail(3), flush=True) -#del ans +question = "largest two v3 by id6" # q8 +gc.collect() +t_start = timeit.default_timer() +ans = x[~x['v3'].isna()][['id6','v3']].groupby('id6', dropna=False, observed=True).apply(lambda x: x.nlargest(2, columns='v3'), meta={'id6':'Int64', 'v3':'float64'})[['v3']].compute() +ans.reset_index(level='id6', inplace=True) +ans.reset_index(drop=True, inplace=True) # drop because nlargest creates some extra new index field +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v3'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x[~x['v3'].isna()][['id6','v3']].groupby('id6', dropna=False, observed=True).apply(lambda x: x.nlargest(2, columns='v3'), meta={'id6':'Int64', 'v3':'float64'})[['v3']].compute() +ans.reset_index(level='id6', inplace=True) +ans.reset_index(drop=True, inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v3'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans -#question = "regression v1 v2 by id2 id4" # q9 -#gc.collect() -#t_start = timeit.default_timer() -#ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], dropna=False, observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}), meta={'r2':'float64'}).compute() -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['r2'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#del ans -#gc.collect() -#t_start = timeit.default_timer() -#ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], dropna=False, observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}), meta={'r2':'float64'}).compute() -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans['r2'].sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(ans.head(3), flush=True) -#print(ans.tail(3), flush=True) -#del ans +question = "regression v1 v2 by id2 id4" # q9 +gc.collect() +t_start = timeit.default_timer() +ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], dropna=False, observed=True)[["v1", "v2"]].apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}), meta={'r2':'float64'}).compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['r2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], dropna=False, observed=True)[["v1", "v2"]].apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}), meta={'r2':'float64'}).compute() +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['r2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans -#question = "sum v3 count by id1:id6" # q10 -#gc.collect() -#t_start = timeit.default_timer() -#ans = x.groupby(['id1','id2','id3','id4','id5','id6'], dropna=False, observed=True).agg({'v3':'sum', 'v1':'size'}).compute() # column name different than expected, ignore it because: ValueError: Metadata inference failed in `rename`: Original error is below: ValueError('Level values must be unique: [nan, nan] on level 0',) -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans.v3.sum(), ans.v1.sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#del ans -#gc.collect() -#t_start = timeit.default_timer() -#ans = x.groupby(['id1','id2','id3','id4','id5','id6'], dropna=False, observed=True).agg({'v3':'sum', 'v1':'size'}).compute() -#ans.reset_index(inplace=True) -#print(ans.shape, flush=True) -#t = timeit.default_timer() - t_start -#m = memory_usage() -#t_start = timeit.default_timer() -#chk = [ans.v3.sum(), ans.v1.sum()] -#chkt = timeit.default_timer() - t_start -#write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(ans.head(3), flush=True) -#print(ans.tail(3), flush=True) -#del ans +question = "sum v3 count by id1:id6" # q10 +print(question) +gc.collect() +t_start = timeit.default_timer() +ans = ( + x.groupby( + ['id1', 'id2', 'id3', 'id4', 'id5', 'id6'], + dropna=False, + observed=True, + ) + .agg({'v3': 'sum', 'v1': 'size'}, split_out=x.npartitions) + .rename(columns={"v1": "count"}) + .compute() +) +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans.v3.sum(), ans["count"].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = ( + x.groupby( + ['id1', 'id2', 'id3', 'id4', 'id5', 'id6'], + dropna=False, + observed=True, + ) + .agg({'v3': 'sum', 'v1': 'size'}, split_out=x.npartitions) + .rename(columns={"v1": "count"}) + .compute() +) +ans.reset_index(inplace=True) +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans.v3.sum(), ans["count"].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans print("grouping finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True) diff --git a/dask/join-dask.py b/dask/join-dask.py index 5ddcafa7..0c7197c7 100755 --- a/dask/join-dask.py +++ b/dask/join-dask.py @@ -43,10 +43,10 @@ # medium = dd.read_parquet(src_jn_y[1], engine="fastparquet") # big = dd.read_parquet(src_jn_y[2], engine="fastparquet") #else: -x = dd.read_csv(src_jn_x, dtype={'id1':'Int32','id2':'Int32','id3':'Int32','id4':'category','id5':'category','id6':'category','v1':'float64'}).persist() -small = dd.read_csv(src_jn_y[0], dtype={'id1':'Int32','id4':'category','v2':'float64'}).persist() -medium = dd.read_csv(src_jn_y[1], dtype={'id1':'Int32','id2':'Int32','id4':'category','id5':'category','v2':'float64'}).persist() -big = dd.read_csv(src_jn_y[2], dtype={'id1':'Int32','id2':'Int32','id3':'Int32','id4':'category','id5':'category','id6':'category','v2':'float64'}).persist() +x = dd.read_csv(src_jn_x, engine="pyarrow").persist() +small = dd.read_csv(src_jn_y[0], engine="pyarrow").persist() +medium = dd.read_csv(src_jn_y[1], engine="pyarrow").persist() +big = dd.read_csv(src_jn_y[2], engine="pyarrow").persist() in_rows = len(x) print(in_rows, flush=True) diff --git a/dask/setup-dask.sh b/dask/setup-dask.sh index f22e3148..c6fac985 100755 --- a/dask/setup-dask.sh +++ b/dask/setup-dask.sh @@ -1,13 +1,11 @@ #!/bin/bash set -e -virtualenv dask/py-dask --python=python3 +virtualenv dask/py-dask --python=python3.10 source dask/py-dask/bin/activate # install binaries python3 -m pip install "dask[complete]" -python3 -m pip install pandas psutil -python3 -m pip install distributed # check # python3 From 966765f7a37697ddda4dfb2d34d849ab4dc0d484 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 6 Dec 2023 11:40:29 +0000 Subject: [PATCH 24/25] add results for dask --- logs.csv | 45 ++++++++++++ time.csv | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) diff --git a/logs.csv b/logs.csv index 1775ca19..7ce38137 100644 --- a/logs.csv +++ b/logs.csv @@ -899,3 +899,48 @@ ip-172-31-31-147,1699289348,arrow,13.0.0.1,,join,J1_1e9_NA_0_0,1699309325.24506, ip-172-31-31-147,1699289348,arrow,13.0.0.1,,join,J1_1e9_NA_0_0,1699309934.66574,finish,1,137 ip-172-31-31-147,1699437325,data.table,1.14.9,88039186915028ab3c93ccfd8e22c0d1c3534b1a,join,J1_1e9_NA_0_0,1699437325.61783,start,, ip-172-31-31-147,1699437325,data.table,1.14.9,88039186915028ab3c93ccfd8e22c0d1c3534b1a,join,J1_1e9_NA_0_0,1699451725.72193,finish,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_0_0,1701270373.98008,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_0_0,1701270419.69202,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e1_0_0,1701270434.7073,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e1_0_0,1701270696.40654,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_2e0_0_0,1701270711.42164,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_2e0_0_0,1701271665.05859,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_0_1,1701271680.07383,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_0_1,1701271724.90644,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_5_0,1701271739.92167,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e7_1e2_5_0,1701271741.80487,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_0_0,1701271756.82011,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_0_0,1701272323.32654,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e1_0_0,1701272338.34178,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e1_0_0,1701279538.71904,finish,5477,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_2e0_0_0,1701279553.73412,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_2e0_0_0,1701286773.73856,finish,1053,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_0_1,1701286788.7538,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_0_1,1701287351.43274,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_5_0,1701287366.44798,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e8_1e2_5_0,1701287368.35025,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_0_0,1701287383.3655,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_0_0,1701298184.22329,finish,16301,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e1_0_0,1701298199.23849,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e1_0_0,1701308999.66472,finish,3388,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_2e0_0_0,1701309014.67983,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_2e0_0_0,1701319814.99198,finish,2416,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_0_1,1701319830.00705,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_0_1,1701330630.76343,finish,30812,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_5_0,1701330645.77865,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,groupby,G1_1e9_1e2_5_0,1701330648.18685,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_0_0,1701330663.20195,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_0_0,1701330684.53032,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_5_0,1701330699.54542,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_5_0,1701330700.17275,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_0_1,1701330715.18799,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e7_NA_0_1,1701330715.81832,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_0_0,1701330730.83357,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_0_0,1701331031.18587,finish,0,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_5_0,1701331046.19362,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_5_0,1701331046.8236,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_0_1,1701331061.83881,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e8_NA_0_1,1701331062.46537,finish,1,0 +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e9_NA_0_0,1701331077.48058,start,, +ip-172-31-31-147,1701270373,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,join,J1_1e9_NA_0_0,1701332733.02768,finish,1,137 + diff --git a/time.csv b/time.csv index ab100aa0..e98a5e29 100644 --- a/time.csv +++ b/time.csv @@ -6267,3 +6267,220 @@ ip-172-31-31-147,1699289348,1699301553,join,J1_1e9_NA_0_0,1000000000,small inner ip-172-31-31-147,1699289348,1699309384,join,J1_1e9_NA_0_0,1000000000,small inner on int,899999033,9,arrow,13.0.0.1,,inner_join,1,9.688,NA,TRUE,44998904641;45286789554,1.74,NA,FALSE ip-172-31-31-147,1699289348,1699309393,join,J1_1e9_NA_0_0,1000000000,small inner on int,899999033,9,arrow,13.0.0.1,,inner_join,2,6.281,NA,TRUE,44998904641;45286789554,1.965,NA,FALSE ip-172-31-31-147,1699437325,1699447786,join,J1_1e9_NA_0_0,1000000000,small inner on int,899999033,9,data.table,1.14.9,88039186915028ab3c93ccfd8e22c0d1c3534b1a,[.data.table,1,18.94,NA,TRUE,44998904641;45286789554,3.059,NA,FALSE +ip-172-31-31-147,1701270373,1701270376.9167728,groupby,G1_1e7_1e2_0_0,10000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.084,0.144,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701270377.0322177,groupby,G1_1e7_1e2_0_0,10000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.083,0.144,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701270377.2725596,groupby,G1_1e7_1e2_0_0,10000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.2,0.145,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701270377.4609835,groupby,G1_1e7_1e2_0_0,10000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.148,0.145,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701270377.787234,groupby,G1_1e7_1e2_0_0,10000000,sum v1 mean v3 by id3,100000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.288,0.153,TRUE,29998789;4999719.622,0.0,,False +ip-172-31-31-147,1701270373,1701270378.1396706,groupby,G1_1e7_1e2_0_0,10000000,sum v1 mean v3 by id3,100000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.318,0.154,TRUE,29998789;4999719.622,0.0,,False +ip-172-31-31-147,1701270373,1701270378.2712116,groupby,G1_1e7_1e2_0_0,10000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.092,0.148,TRUE,299.988;799.894;4999.767,0.0,,False +ip-172-31-31-147,1701270373,1701270378.3957121,groupby,G1_1e7_1e2_0_0,10000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.091,0.148,TRUE,299.988;799.894;4999.767,0.0,,False +ip-172-31-31-147,1701270373,1701270378.6309402,groupby,G1_1e7_1e2_0_0,10000000,sum v1:v3 by id6,100000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.196,0.153,TRUE,29998789;79989360;499976651.408,0.001,,False +ip-172-31-31-147,1701270373,1701270378.935985,groupby,G1_1e7_1e2_0_0,10000000,sum v1:v3 by id6,100000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.271,0.152,TRUE,29998789;79989360;499976651.408,0.001,,False +ip-172-31-31-147,1701270373,1701270381.7563255,groupby,G1_1e7_1e2_0_0,10000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.777,0.148,TRUE,499920.14;288648.108,0.001,,False +ip-172-31-31-147,1701270373,1701270384.5215893,groupby,G1_1e7_1e2_0_0,10000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.724,0.148,TRUE,499920.14;288648.108,0.001,,False +ip-172-31-31-147,1701270373,1701270384.834282,groupby,G1_1e7_1e2_0_0,10000000,max v1 - min v2 by id3,100000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.274,0.154,TRUE,399882,0.0,,False +ip-172-31-31-147,1701270373,1701270385.1283183,groupby,G1_1e7_1e2_0_0,10000000,max v1 - min v2 by id3,100000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.26,0.154,TRUE,399882,0.0,,False +ip-172-31-31-147,1701270373,1701270396.1866038,groupby,G1_1e7_1e2_0_0,10000000,largest two v3 by id6,200000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,11.021,0.166,TRUE,19700450.588,0.0,,False +ip-172-31-31-147,1701270373,1701270406.9602866,groupby,G1_1e7_1e2_0_0,10000000,largest two v3 by id6,200000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,10.737,0.172,TRUE,19700450.588,0.0,,False +ip-172-31-31-147,1701270373,1701270408.644578,groupby,G1_1e7_1e2_0_0,10000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.644,0.157,TRUE,9.839,0.0,,False +ip-172-31-31-147,1701270373,1701270410.1822402,groupby,G1_1e7_1e2_0_0,10000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.502,0.157,TRUE,9.839,0.0,,False +ip-172-31-31-147,1701270373,1701270414.7781541,groupby,G1_1e7_1e2_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.537,0.871,TRUE,499976651.408;10000000,0.019,,False +ip-172-31-31-147,1701270373,1701270419.2975392,groupby,G1_1e7_1e2_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.444,1.107,TRUE,499976651.408;10000000,0.019,,False +ip-172-31-31-147,1701270373,1701270437.7671146,groupby,G1_1e7_1e1_0_0,10000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.075,0.146,TRUE,29998597,0.0,,False +ip-172-31-31-147,1701270373,1701270437.8766892,groupby,G1_1e7_1e1_0_0,10000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.076,0.146,TRUE,29998597,0.0,,False +ip-172-31-31-147,1701270373,1701270438.0429707,groupby,G1_1e7_1e1_0_0,10000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.129,0.146,TRUE,29998597,0.0,,False +ip-172-31-31-147,1701270373,1701270438.201231,groupby,G1_1e7_1e1_0_0,10000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.125,0.146,TRUE,29998597,0.0,,False +ip-172-31-31-147,1701270373,1701270439.9478836,groupby,G1_1e7_1e1_0_0,10000000,sum v1 mean v3 by id3,999951,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.704,0.19,TRUE,29998597;50000558.524,0.003,,False +ip-172-31-31-147,1701270373,1701270441.6072276,groupby,G1_1e7_1e1_0_0,10000000,sum v1 mean v3 by id3,999951,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.623,0.249,TRUE,29998597;50000558.524,0.002,,False +ip-172-31-31-147,1701270373,1701270441.739028,groupby,G1_1e7_1e1_0_0,10000000,mean v1:v3 by id4,10,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.091,0.25,TRUE,29.999;79.992;499.981,0.0,,False +ip-172-31-31-147,1701270373,1701270441.8600256,groupby,G1_1e7_1e1_0_0,10000000,mean v1:v3 by id4,10,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.087,0.25,TRUE,29.999;79.992;499.981,0.0,,False +ip-172-31-31-147,1701270373,1701270442.7412753,groupby,G1_1e7_1e1_0_0,10000000,sum v1:v3 by id6,999965,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.839,0.25,TRUE,29998597;79991898;499980747.01,0.003,,False +ip-172-31-31-147,1701270373,1701270443.6023772,groupby,G1_1e7_1e1_0_0,10000000,sum v1:v3 by id6,999965,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.825,0.25,TRUE,29998597;79991898;499980747.01,0.003,,False +ip-172-31-31-147,1701270373,1701270445.99507,groupby,G1_1e7_1e1_0_0,10000000,median v3 sd v3 by id4 id5,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.35,0.25,TRUE,4999.573;2887.162,0.001,,False +ip-172-31-31-147,1701270373,1701270448.3552601,groupby,G1_1e7_1e1_0_0,10000000,median v3 sd v3 by id4 id5,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.325,0.251,TRUE,4999.573;2887.162,0.001,,False +ip-172-31-31-147,1701270373,1701270449.8542595,groupby,G1_1e7_1e1_0_0,10000000,max v1 - min v2 by id3,999951,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.457,0.266,TRUE,2789316,0.001,,False +ip-172-31-31-147,1701270373,1701270451.3055725,groupby,G1_1e7_1e1_0_0,10000000,max v1 - min v2 by id3,999951,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.414,0.266,TRUE,2789316,0.001,,False +ip-172-31-31-147,1701270373,1701270553.2851121,groupby,G1_1e7_1e1_0_0,10000000,largest two v3 by id6,1999500,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,101.937,0.406,TRUE,170016562.642,0.003,,False +ip-172-31-31-147,1701270373,1701270654.7354357,groupby,G1_1e7_1e1_0_0,10000000,largest two v3 by id6,1999500,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,101.393,0.406,TRUE,170016562.642,0.003,,False +ip-172-31-31-147,1701270373,1701270656.277447,groupby,G1_1e7_1e1_0_0,10000000,regression v1 v2 by id2 id4,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.484,0.252,TRUE,0.001,0.0,,False +ip-172-31-31-147,1701270373,1701270657.756136,groupby,G1_1e7_1e1_0_0,10000000,regression v1 v2 by id2 id4,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.442,0.252,TRUE,0.001,0.0,,False +ip-172-31-31-147,1701270373,1701270676.3039303,groupby,G1_1e7_1e1_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,18.487,1.441,TRUE,499980747.01;10000000,0.018,,False +ip-172-31-31-147,1701270373,1701270695.8785276,groupby,G1_1e7_1e1_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,19.482,1.463,TRUE,499980747.01;10000000,0.018,,False +ip-172-31-31-147,1701270373,1701270714.5591867,groupby,G1_1e7_2e0_0_0,10000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.09,0.146,TRUE,30000054,0.0,,False +ip-172-31-31-147,1701270373,1701270714.6752782,groupby,G1_1e7_2e0_0_0,10000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.083,0.146,TRUE,30000054,0.0,,False +ip-172-31-31-147,1701270373,1701270714.8593626,groupby,G1_1e7_2e0_0_0,10000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.145,0.146,TRUE,30000054,0.0,,False +ip-172-31-31-147,1701270373,1701270715.0414004,groupby,G1_1e7_2e0_0_0,10000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.148,0.146,TRUE,30000054,0.0,,False +ip-172-31-31-147,1701270373,1701270719.2378724,groupby,G1_1e7_2e0_0_0,10000000,sum v1 mean v3 by id3,4323566,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.143,0.275,TRUE,30000054;216107547.389,0.013,,False +ip-172-31-31-147,1701270373,1701270723.387519,groupby,G1_1e7_2e0_0_0,10000000,sum v1 mean v3 by id3,4323566,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.084,0.295,TRUE,30000054;216107547.389,0.012,,False +ip-172-31-31-147,1701270373,1701270723.5407832,groupby,G1_1e7_2e0_0_0,10000000,mean v1:v3 by id4,2,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.099,0.183,TRUE,6.0;15.997;99.987,0.0,,False +ip-172-31-31-147,1701270373,1701270723.6719239,groupby,G1_1e7_2e0_0_0,10000000,mean v1:v3 by id4,2,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.095,0.183,TRUE,6.0;15.997;99.987,0.0,,False +ip-172-31-31-147,1701270373,1701270725.8696494,groupby,G1_1e7_2e0_0_0,10000000,sum v1:v3 by id6,4322014,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.138,0.28,TRUE,30000054;79986418;499936032.106,0.017,,False +ip-172-31-31-147,1701270373,1701270728.0564108,groupby,G1_1e7_2e0_0_0,10000000,sum v1:v3 by id6,4322014,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.13,0.28,TRUE,30000054;79986418;499936032.106,0.016,,False +ip-172-31-31-147,1701270373,1701270730.4869003,groupby,G1_1e7_2e0_0_0,10000000,median v3 sd v3 by id4 id5,4,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.372,0.184,TRUE,199.97;115.489,0.001,,False +ip-172-31-31-147,1701270373,1701270732.9899151,groupby,G1_1e7_2e0_0_0,10000000,median v3 sd v3 by id4 id5,4,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.464,0.184,TRUE,199.97;115.489,0.001,,False +ip-172-31-31-147,1701270373,1701270736.615203,groupby,G1_1e7_2e0_0_0,10000000,max v1 - min v2 by id3,4323566,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,3.575,0.266,TRUE,-8263086,0.004,,False +ip-172-31-31-147,1701270373,1701270740.2500696,groupby,G1_1e7_2e0_0_0,10000000,max v1 - min v2 by id3,4323566,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,3.578,0.266,TRUE,-8263086,0.004,,False +ip-172-31-31-147,1701270373,1701271148.4950092,groupby,G1_1e7_2e0_0_0,10000000,largest two v3 by id6,7291480,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,408.177,0.26,TRUE,419079607.0,0.01,,False +ip-172-31-31-147,1701270373,1701271554.5266051,groupby,G1_1e7_2e0_0_0,10000000,largest two v3 by id6,7291480,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,405.965,0.313,TRUE,419079607.0,0.01,,False +ip-172-31-31-147,1701270373,1701271557.4972403,groupby,G1_1e7_2e0_0_0,10000000,regression v1 v2 by id2 id4,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.903,0.21,TRUE,0.0,0.0,,False +ip-172-31-31-147,1701270373,1701271560.4159844,groupby,G1_1e7_2e0_0_0,10000000,regression v1 v2 by id2 id4,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.871,0.21,TRUE,0.0,0.0,,False +ip-172-31-31-147,1701270373,1701271612.7523394,groupby,G1_1e7_2e0_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,52.265,1.436,TRUE,499936032.106;10000000,0.018,,False +ip-172-31-31-147,1701270373,1701271664.2474732,groupby,G1_1e7_2e0_0_0,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,51.396,1.478,TRUE,499936032.106;10000000,0.018,,False +ip-172-31-31-147,1701270373,1701271682.8363354,groupby,G1_1e7_1e2_0_1,10000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.072,0.143,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701271682.9395196,groupby,G1_1e7_1e2_0_1,10000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.07,0.143,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701271683.109577,groupby,G1_1e7_1e2_0_1,10000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.132,0.145,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701271683.2747264,groupby,G1_1e7_1e2_0_1,10000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.131,0.146,TRUE,29998789,0.0,,False +ip-172-31-31-147,1701270373,1701271683.6124837,groupby,G1_1e7_1e2_0_1,10000000,sum v1 mean v3 by id3,100000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.299,0.153,TRUE,29998789;4999719.622,0.0,,False +ip-172-31-31-147,1701270373,1701271683.9482932,groupby,G1_1e7_1e2_0_1,10000000,sum v1 mean v3 by id3,100000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.301,0.154,TRUE,29998789;4999719.622,0.0,,False +ip-172-31-31-147,1701270373,1701271684.0828857,groupby,G1_1e7_1e2_0_1,10000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.095,0.148,TRUE,299.988;799.894;4999.767,0.0,,False +ip-172-31-31-147,1701270373,1701271684.2127635,groupby,G1_1e7_1e2_0_1,10000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.096,0.148,TRUE,299.988;799.894;4999.767,0.0,,False +ip-172-31-31-147,1701270373,1701271684.4555683,groupby,G1_1e7_1e2_0_1,10000000,sum v1:v3 by id6,100000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.203,0.153,TRUE,29998789;79989360;499976651.408,0.001,,False +ip-172-31-31-147,1701270373,1701271684.6869807,groupby,G1_1e7_1e2_0_1,10000000,sum v1:v3 by id6,100000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.197,0.153,TRUE,29998789;79989360;499976651.408,0.001,,False +ip-172-31-31-147,1701270373,1701271687.291535,groupby,G1_1e7_1e2_0_1,10000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.56,0.155,TRUE,499920.14;288648.108,0.001,,False +ip-172-31-31-147,1701270373,1701271689.876269,groupby,G1_1e7_1e2_0_1,10000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.55,0.155,TRUE,499920.14;288648.108,0.001,,False +ip-172-31-31-147,1701270373,1701271690.187954,groupby,G1_1e7_1e2_0_1,10000000,max v1 - min v2 by id3,100000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.263,0.155,TRUE,399882,0.0,,False +ip-172-31-31-147,1701270373,1701271690.4806156,groupby,G1_1e7_1e2_0_1,10000000,max v1 - min v2 by id3,100000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.258,0.155,TRUE,399882,0.0,,False +ip-172-31-31-147,1701270373,1701271701.6402454,groupby,G1_1e7_1e2_0_1,10000000,largest two v3 by id6,200000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,11.121,0.169,TRUE,19700450.588,0.0,,False +ip-172-31-31-147,1701270373,1701271712.701363,groupby,G1_1e7_1e2_0_1,10000000,largest two v3 by id6,200000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,11.025,0.173,TRUE,19700450.588,0.0,,False +ip-172-31-31-147,1701270373,1701271714.142669,groupby,G1_1e7_1e2_0_1,10000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.401,0.159,TRUE,9.839,0.0,,False +ip-172-31-31-147,1701270373,1701271715.6366665,groupby,G1_1e7_1e2_0_1,10000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.457,0.159,TRUE,9.839,0.0,,False +ip-172-31-31-147,1701270373,1701271720.089303,groupby,G1_1e7_1e2_0_1,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.392,0.89,TRUE,499976651.408;10000000,0.019,,False +ip-172-31-31-147,1701270373,1701271724.488934,groupby,G1_1e7_1e2_0_1,10000000,sum v3 count by id1:id6,10000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.325,1.137,TRUE,499976651.408;10000000,0.018,,False +ip-172-31-31-147,1701270373,1701271765.6013942,groupby,G1_1e8_1e2_0_0,100000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.206,0.146,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701271765.8140488,groupby,G1_1e8_1e2_0_0,100000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.175,0.146,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701271766.2239163,groupby,G1_1e8_1e2_0_0,100000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.367,0.148,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701271766.6120996,groupby,G1_1e8_1e2_0_0,100000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.349,0.148,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701271772.4396286,groupby,G1_1e8_1e2_0_0,100000000,sum v1 mean v3 by id3,1000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,5.777,0.193,TRUE,299991302;50001192.355,0.004,,False +ip-172-31-31-147,1701270373,1701271778.0148604,groupby,G1_1e8_1e2_0_0,100000000,sum v1 mean v3 by id3,1000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,5.53,0.252,TRUE,299991302;50001192.355,0.004,,False +ip-172-31-31-147,1701270373,1701271778.448956,groupby,G1_1e8_1e2_0_0,100000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.377,0.253,TRUE,299.991;799.978;5000.104,0.001,,False +ip-172-31-31-147,1701270373,1701271778.7874477,groupby,G1_1e8_1e2_0_0,100000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.295,0.253,TRUE,299.991;799.978;5000.104,0.0,,False +ip-172-31-31-147,1701270373,1701271781.9041739,groupby,G1_1e8_1e2_0_0,100000000,sum v1:v3 by id6,1000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,3.065,0.253,TRUE,299991302;799978221;5000103937.772,0.004,,False +ip-172-31-31-147,1701270373,1701271785.090914,groupby,G1_1e8_1e2_0_0,100000000,sum v1:v3 by id6,1000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,3.139,0.253,TRUE,299991302;799978221;5000103937.772,0.004,,False +ip-172-31-31-147,1701270373,1701271789.215396,groupby,G1_1e8_1e2_0_0,100000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.072,0.254,TRUE,500019.998;288668.357,0.001,,False +ip-172-31-31-147,1701270373,1701271793.481325,groupby,G1_1e8_1e2_0_0,100000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.217,0.255,TRUE,500019.998;288668.357,0.002,,False +ip-172-31-31-147,1701270373,1701271798.4848406,groupby,G1_1e8_1e2_0_0,100000000,max v1 - min v2 by id3,1000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.932,0.27,TRUE,3998729,0.001,,False +ip-172-31-31-147,1701270373,1701271803.1239278,groupby,G1_1e8_1e2_0_0,100000000,max v1 - min v2 by id3,1000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.587,0.27,TRUE,3998729,0.002,,False +ip-172-31-31-147,1701270373,1701271866.7611928,groupby,G1_1e8_1e2_0_0,100000000,largest two v3 by id6,2000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,63.586,0.419,TRUE,196996660.391,0.003,,False +ip-172-31-31-147,1701270373,1701271933.098539,groupby,G1_1e8_1e2_0_0,100000000,largest two v3 by id6,2000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,66.262,0.422,TRUE,196996660.391,0.003,,False +ip-172-31-31-147,1701270373,1701271939.001758,groupby,G1_1e8_1e2_0_0,100000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,5.823,0.249,TRUE,1.007,0.0,,False +ip-172-31-31-147,1701270373,1701271944.2145886,groupby,G1_1e8_1e2_0_0,100000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,5.156,0.251,TRUE,1.007,0.0,,False +ip-172-31-31-147,1701270373,1701272135.447855,groupby,G1_1e8_1e2_0_0,100000000,sum v3 count by id1:id6,100000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,190.979,12.119,TRUE,5000103937.772;100000000,0.178,,False +ip-172-31-31-147,1701270373,1701272321.6028104,groupby,G1_1e8_1e2_0_0,100000000,sum v3 count by id1:id6,100000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,185.667,12.438,TRUE,5000103937.772;100000000,0.171,,False +ip-172-31-31-147,1701270373,1701272347.1373367,groupby,G1_1e8_1e1_0_0,100000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.199,0.145,TRUE,300012466,0.0,,False +ip-172-31-31-147,1701270373,1701272347.352836,groupby,G1_1e8_1e1_0_0,100000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.173,0.145,TRUE,300012466,0.0,,False +ip-172-31-31-147,1701270373,1701272347.696981,groupby,G1_1e8_1e1_0_0,100000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.303,0.145,TRUE,300012466,0.0,,False +ip-172-31-31-147,1701270373,1701272348.0351849,groupby,G1_1e8_1e1_0_0,100000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.3,0.146,TRUE,300012466,0.0,,False +ip-172-31-31-147,1701270373,1701272380.678586,groupby,G1_1e8_1e1_0_0,100000000,sum v1 mean v3 by id3,9999602,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,32.576,0.446,TRUE,300012466;499941400.876,0.023,,False +ip-172-31-31-147,1701270373,1701272412.917558,groupby,G1_1e8_1e1_0_0,100000000,sum v1 mean v3 by id3,9999602,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,32.137,0.448,TRUE,300012466;499941400.876,0.022,,False +ip-172-31-31-147,1701270373,1701272413.4411626,groupby,G1_1e8_1e1_0_0,100000000,mean v1:v3 by id4,10,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.44,0.159,TRUE,30.001;80.008;499.958,0.0,,False +ip-172-31-31-147,1701270373,1701272413.803133,groupby,G1_1e8_1e1_0_0,100000000,mean v1:v3 by id4,10,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.319,0.159,TRUE,30.001;80.008;499.958,0.0,,False +ip-172-31-31-147,1701270373,1701272431.17442,groupby,G1_1e8_1e1_0_0,100000000,sum v1:v3 by id6,9999538,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,17.295,0.458,TRUE,300012466;800079612;4999575436.012,0.028,,False +ip-172-31-31-147,1701270373,1701272448.5870707,groupby,G1_1e8_1e1_0_0,100000000,sum v1:v3 by id6,9999538,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,17.314,0.46,TRUE,300012466;800079612;4999575436.012,0.027,,False +ip-172-31-31-147,1701270373,1701272453.278708,groupby,G1_1e8_1e1_0_0,100000000,median v3 sd v3 by id4 id5,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.626,0.162,TRUE,4999.826;2886.819,0.001,,False +ip-172-31-31-147,1701270373,1701272457.872118,groupby,G1_1e8_1e1_0_0,100000000,median v3 sd v3 by id4 id5,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.534,0.163,TRUE,4999.826;2886.819,0.001,,False +ip-172-31-31-147,1701270373,1701272488.1121628,groupby,G1_1e8_1e1_0_0,100000000,max v1 - min v2 by id3,9999602,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,30.176,0.388,TRUE,27890093,0.007,,False +ip-172-31-31-147,1701270373,1701272515.124372,groupby,G1_1e8_1e1_0_0,100000000,max v1 - min v2 by id3,9999602,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,26.928,0.389,TRUE,27890093,0.007,,False +ip-172-31-31-147,1701270373,1701273055.48205,groupby,G1_1e8_1e1_0_0,100000000,largest two v3 by id6,19994518,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,540.245,0.959,TRUE,1700010092.167,0.024,,False +ip-172-31-31-147,1701270373,1701273588.4104173,groupby,G1_1e8_1e1_0_0,100000000,largest two v3 by id6,19994518,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,532.832,1.042,TRUE,1700010092.167,0.024,,False +ip-172-31-31-147,1701270373,1701273593.750396,groupby,G1_1e8_1e1_0_0,100000000,regression v1 v2 by id2 id4,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,5.261,0.745,TRUE,0.0,0.0,,False +ip-172-31-31-147,1701270373,1701273598.922429,groupby,G1_1e8_1e1_0_0,100000000,regression v1 v2 by id2 id4,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,5.113,0.747,TRUE,0.0,0.0,,False +ip-172-31-31-147,1701270373,1701279562.9120233,groupby,G1_1e8_2e0_0_0,100000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.184,0.147,TRUE,299988126,0.0,,False +ip-172-31-31-147,1701270373,1701279563.1294274,groupby,G1_1e8_2e0_0_0,100000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.178,0.147,TRUE,299988126,0.0,,False +ip-172-31-31-147,1701270373,1701279563.4639542,groupby,G1_1e8_2e0_0_0,100000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.295,0.148,TRUE,299988126,0.0,,False +ip-172-31-31-147,1701270373,1701279563.7826111,groupby,G1_1e8_2e0_0_0,100000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.283,0.148,TRUE,299988126,0.0,,False +ip-172-31-31-147,1701270373,1701279627.0953436,groupby,G1_1e8_2e0_0_0,100000000,sum v1 mean v3 by id3,43233017,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,63.135,1.44,TRUE,299988126;2161776167.331,0.134,,False +ip-172-31-31-147,1701270373,1701279690.6538594,groupby,G1_1e8_2e0_0_0,100000000,sum v1 mean v3 by id3,43233017,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,63.251,1.442,TRUE,299988126;2161776167.331,0.131,,False +ip-172-31-31-147,1701270373,1701279691.9334166,groupby,G1_1e8_2e0_0_0,100000000,mean v1:v3 by id4,2,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.091,0.153,TRUE,6.0;15.999;100.001,0.0,,False +ip-172-31-31-147,1701270373,1701279692.2921662,groupby,G1_1e8_2e0_0_0,100000000,mean v1:v3 by id4,2,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.319,0.153,TRUE,6.0;15.999;100.001,0.0,,False +ip-172-31-31-147,1701270373,1701279730.054133,groupby,G1_1e8_2e0_0_0,100000000,sum v1:v3 by id6,43238066,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,37.547,1.443,TRUE,299988126;799952220;5000051370.457,0.17,,False +ip-172-31-31-147,1701270373,1701279767.4784524,groupby,G1_1e8_2e0_0_0,100000000,sum v1:v3 by id6,43238066,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,37.112,1.443,TRUE,299988126;799952220;5000051370.457,0.17,,False +ip-172-31-31-147,1701270373,1701279774.8029108,groupby,G1_1e8_2e0_0_0,100000000,median v3 sd v3 by id4 id5,4,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,7.179,0.154,TRUE,199.998;115.468,0.001,,False +ip-172-31-31-147,1701270373,1701279781.85363,groupby,G1_1e8_2e0_0_0,100000000,median v3 sd v3 by id4 id5,4,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,7.009,0.155,TRUE,199.998;115.468,0.001,,False +ip-172-31-31-147,1701270373,1701279834.7651896,groupby,G1_1e8_2e0_0_0,100000000,max v1 - min v2 by id3,43233017,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,52.82,1.125,TRUE,-82715914,0.04,,False +ip-172-31-31-147,1701270373,1701279889.3444054,groupby,G1_1e8_2e0_0_0,100000000,max v1 - min v2 by id3,43233017,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,54.378,1.126,TRUE,-82715914,0.041,,False +ip-172-31-31-147,1701270373,1701286797.1623476,groupby,G1_1e8_1e2_0_1,100000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.192,0.146,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701286797.3755906,groupby,G1_1e8_1e2_0_1,100000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.173,0.146,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701286797.7259943,groupby,G1_1e8_1e2_0_1,100000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.307,0.147,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701286798.0494366,groupby,G1_1e8_1e2_0_1,100000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.286,0.147,TRUE,299991302,0.0,,False +ip-172-31-31-147,1701270373,1701286803.5212162,groupby,G1_1e8_1e2_0_1,100000000,sum v1 mean v3 by id3,1000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,5.423,0.192,TRUE,299991302;50001192.355,0.003,,False +ip-172-31-31-147,1701270373,1701286809.0156846,groupby,G1_1e8_1e2_0_1,100000000,sum v1 mean v3 by id3,1000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,5.451,0.252,TRUE,299991302;50001192.355,0.003,,False +ip-172-31-31-147,1701270373,1701286809.4124792,groupby,G1_1e8_1e2_0_1,100000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,0.341,0.253,TRUE,299.991;799.978;5000.104,0.0,,False +ip-172-31-31-147,1701270373,1701286809.7314208,groupby,G1_1e8_1e2_0_1,100000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,0.278,0.253,TRUE,299.991;799.978;5000.104,0.0,,False +ip-172-31-31-147,1701270373,1701286812.7587445,groupby,G1_1e8_1e2_0_1,100000000,sum v1:v3 by id6,1000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.977,0.254,TRUE,299991302;799978221;5000103937.772,0.004,,False +ip-172-31-31-147,1701270373,1701286815.7217731,groupby,G1_1e8_1e2_0_1,100000000,sum v1:v3 by id6,1000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.917,0.254,TRUE,299991302;799978221;5000103937.772,0.004,,False +ip-172-31-31-147,1701270373,1701286819.8786478,groupby,G1_1e8_1e2_0_1,100000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.109,0.255,TRUE,500019.998;288668.357,0.001,,False +ip-172-31-31-147,1701270373,1701286823.9602194,groupby,G1_1e8_1e2_0_1,100000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.035,0.255,TRUE,500019.998;288668.357,0.001,,False +ip-172-31-31-147,1701270373,1701286828.7490864,groupby,G1_1e8_1e2_0_1,100000000,max v1 - min v2 by id3,1000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.719,0.269,TRUE,3998729,0.001,,False +ip-172-31-31-147,1701270373,1701286833.2757466,groupby,G1_1e8_1e2_0_1,100000000,max v1 - min v2 by id3,1000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.48,0.269,TRUE,3998729,0.001,,False +ip-172-31-31-147,1701270373,1701286899.933484,groupby,G1_1e8_1e2_0_1,100000000,largest two v3 by id6,2000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,66.59,0.416,TRUE,196996660.391,0.003,,False +ip-172-31-31-147,1701270373,1701286964.0528538,groupby,G1_1e8_1e2_0_1,100000000,largest two v3 by id6,2000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,64.045,0.418,TRUE,196996660.391,0.003,,False +ip-172-31-31-147,1701270373,1701286968.5076516,groupby,G1_1e8_1e2_0_1,100000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,4.375,0.246,TRUE,1.007,0.0,,False +ip-172-31-31-147,1701270373,1701286972.9494538,groupby,G1_1e8_1e2_0_1,100000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,4.372,0.248,TRUE,1.007,0.0,,False +ip-172-31-31-147,1701270373,1701287165.7388566,groupby,G1_1e8_1e2_0_1,100000000,sum v3 count by id1:id6,100000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,192.555,11.876,TRUE,5000103937.772;100000000,0.172,,False +ip-172-31-31-147,1701270373,1701287349.8430133,groupby,G1_1e8_1e2_0_1,100000000,sum v3 count by id1:id6,100000000,8,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,183.614,12.752,TRUE,5000103937.772;100000000,0.173,,False +ip-172-31-31-147,1701270373,1701287441.070584,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.526,0.159,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701287442.2304997,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.104,0.162,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701287444.9488778,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.662,0.166,TRUE,2999924714,0.001,,False +ip-172-31-31-147,1701270373,1701287447.4995255,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.481,0.168,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701287628.253746,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 mean v3 by id3,10000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,180.651,0.472,TRUE,2999924714;499986249.525,0.031,,False +ip-172-31-31-147,1701270373,1701287868.545322,groupby,G1_1e9_1e2_0_0,1000000000,sum v1 mean v3 by id3,10000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,240.14,0.477,TRUE,2999924714;499986249.525,0.032,,False +ip-172-31-31-147,1701270373,1701287878.5984879,groupby,G1_1e9_1e2_0_0,1000000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,9.912,0.191,TRUE,299.992;799.999;4999.87,0.0,,False +ip-172-31-31-147,1701270373,1701287881.5974123,groupby,G1_1e9_1e2_0_0,1000000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.909,0.195,TRUE,299.992;799.999;4999.87,0.0,,False +ip-172-31-31-147,1701270373,1701287933.6493325,groupby,G1_1e9_1e2_0_0,1000000000,sum v1:v3 by id6,10000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,51.903,0.497,TRUE,2999924714;7999992854;49998699477.823,0.038,,False +ip-172-31-31-147,1701270373,1701287984.4997504,groupby,G1_1e9_1e2_0_0,1000000000,sum v1:v3 by id6,10000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,50.687,0.501,TRUE,2999924714;7999992854;49998699477.823,0.029,,False +ip-172-31-31-147,1701270373,1701288052.1076963,groupby,G1_1e9_1e2_0_0,1000000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,67.441,0.216,TRUE,499981.751;288669.152,0.002,,False +ip-172-31-31-147,1701270373,1701288117.1575553,groupby,G1_1e9_1e2_0_0,1000000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,64.918,0.232,TRUE,499981.751;288669.152,0.001,,False +ip-172-31-31-147,1701270373,1701288286.0172052,groupby,G1_1e9_1e2_0_0,1000000000,max v1 - min v2 by id3,10000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,168.721,0.458,TRUE,39987226,0.01,,False +ip-172-31-31-147,1701270373,1701288452.3057954,groupby,G1_1e9_1e2_0_0,1000000000,max v1 - min v2 by id3,10000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,166.108,0.46,TRUE,39987226,0.01,,False +ip-172-31-31-147,1701270373,1701289103.1724594,groupby,G1_1e9_1e2_0_0,1000000000,largest two v3 by id6,20000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,650.675,1.21,TRUE,1970001789.633,0.024,,False +ip-172-31-31-147,1701270373,1701289732.0969014,groupby,G1_1e9_1e2_0_0,1000000000,largest two v3 by id6,20000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,628.72,1.199,TRUE,1970001789.633,0.024,,False +ip-172-31-31-147,1701270373,1701289950.0646155,groupby,G1_1e9_1e2_0_0,1000000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,217.772,0.72,TRUE,0.098,0.0,,False +ip-172-31-31-147,1701270373,1701290168.7458365,groupby,G1_1e9_1e2_0_0,1000000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,218.409,0.721,TRUE,0.098,0.0,,False +ip-172-31-31-147,1701270373,1701298256.1023448,groupby,G1_1e9_1e1_0_0,1000000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.598,0.16,TRUE,2999933732,0.0,,False +ip-172-31-31-147,1701270373,1701298257.3023472,groupby,G1_1e9_1e1_0_0,1000000000,sum v1 by id1,10,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.123,0.162,TRUE,2999933732,0.001,,False +ip-172-31-31-147,1701270373,1701298259.6293845,groupby,G1_1e9_1e1_0_0,1000000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.224,0.165,TRUE,2999933732,0.0,,False +ip-172-31-31-147,1701270373,1701298261.7867439,groupby,G1_1e9_1e1_0_0,1000000000,sum v1 by id1:id2,100,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.093,0.169,TRUE,2999933732,0.0,,False +ip-172-31-31-147,1701270373,1701309070.1026845,groupby,G1_1e9_2e0_0_0,1000000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.304,0.159,TRUE,2999997259,0.0,,False +ip-172-31-31-147,1701270373,1701309071.2744927,groupby,G1_1e9_2e0_0_0,1000000000,sum v1 by id1,2,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.112,0.162,TRUE,2999997259,0.0,,False +ip-172-31-31-147,1701270373,1701309073.4942474,groupby,G1_1e9_2e0_0_0,1000000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.149,0.165,TRUE,2999997259,0.0,,False +ip-172-31-31-147,1701270373,1701309075.7645607,groupby,G1_1e9_2e0_0_0,1000000000,sum v1 by id1:id2,4,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.202,0.168,TRUE,2999997259,0.0,,False +ip-172-31-31-147,1701270373,1701319887.266315,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,1.37,0.162,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701319888.4063828,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 by id1,100,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,1.084,0.166,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701319890.917585,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,2.457,0.169,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701319893.3536305,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 by id1:id2,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.37,0.172,TRUE,2999924714,0.0,,False +ip-172-31-31-147,1701270373,1701320072.757093,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 mean v3 by id3,10000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,179.307,0.476,TRUE,2999924714;499986249.525,0.032,,False +ip-172-31-31-147,1701270373,1701320223.710797,groupby,G1_1e9_1e2_0_1,1000000000,sum v1 mean v3 by id3,10000000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,150.808,0.481,TRUE,2999924714;499986249.525,0.031,,False +ip-172-31-31-147,1701270373,1701320247.6839767,groupby,G1_1e9_1e2_0_1,1000000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,23.845,0.196,TRUE,299.992;799.999;4999.87,0.0,,False +ip-172-31-31-147,1701270373,1701320250.1565726,groupby,G1_1e9_1e2_0_1,1000000000,mean v1:v3 by id4,100,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,2.392,0.199,TRUE,299.992;799.999;4999.87,0.0,,False +ip-172-31-31-147,1701270373,1701320316.5739267,groupby,G1_1e9_1e2_0_1,1000000000,sum v1:v3 by id6,10000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,66.3,0.5,TRUE,2999924714;7999992854;49998699477.823,0.038,,False +ip-172-31-31-147,1701270373,1701320374.1152494,groupby,G1_1e9_1e2_0_1,1000000000,sum v1:v3 by id6,10000000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,57.394,0.503,TRUE,2999924714;7999992854;49998699477.823,0.028,,False +ip-172-31-31-147,1701270373,1701320442.3258793,groupby,G1_1e9_1e2_0_1,1000000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,68.084,0.219,TRUE,499981.751;288669.152,0.002,,False +ip-172-31-31-147,1701270373,1701320507.3586907,groupby,G1_1e9_1e2_0_1,1000000000,median v3 sd v3 by id4 id5,10000,4,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,64.9,0.233,TRUE,499981.751;288669.152,0.002,,False +ip-172-31-31-147,1701270373,1701320627.4618857,groupby,G1_1e9_1e2_0_1,1000000000,max v1 - min v2 by id3,10000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,119.926,0.456,TRUE,39987226,0.007,,False +ip-172-31-31-147,1701270373,1701320770.1905253,groupby,G1_1e9_1e2_0_1,1000000000,max v1 - min v2 by id3,10000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,142.538,0.458,TRUE,39987226,0.007,,False +ip-172-31-31-147,1701270373,1701321406.5192816,groupby,G1_1e9_1e2_0_1,1000000000,largest two v3 by id6,20000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,636.128,1.21,TRUE,1970001789.633,0.023,,False +ip-172-31-31-147,1701270373,1701322023.264214,groupby,G1_1e9_1e2_0_1,1000000000,largest two v3 by id6,20000000,2,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,616.519,1.183,TRUE,1970001789.633,0.024,,False +ip-172-31-31-147,1701270373,1701322228.1617336,groupby,G1_1e9_1e2_0_1,1000000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,1,204.648,0.889,TRUE,0.098,0.0,,False +ip-172-31-31-147,1701270373,1701322434.9540725,groupby,G1_1e9_1e2_0_1,1000000000,regression v1 v2 by id2 id4,10000,3,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.groupby,2,206.537,0.888,TRUE,0.098,0.0,,False +ip-172-31-31-147,1701270373,1701330673.969423,join,J1_1e7_NA_0_0,10000000,small inner on int,8998860,9,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,0.418,3.259,TRUE,450015153.577;347720187.395,0.023,,False +ip-172-31-31-147,1701270373,1701330674.3800976,join,J1_1e7_NA_0_0,10000000,small inner on int,8998860,9,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,0.367,3.428,TRUE,450015153.577;347720187.395,0.023,,False +ip-172-31-31-147,1701270373,1701330674.964119,join,J1_1e7_NA_0_0,10000000,medium inner on int,8998412,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,0.526,3.499,TRUE,449954076.026;449999844.937,0.022,,False +ip-172-31-31-147,1701270373,1701330675.514599,join,J1_1e7_NA_0_0,10000000,medium inner on int,8998412,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,0.508,3.481,TRUE,449954076.026;449999844.937,0.021,,False +ip-172-31-31-147,1701270373,1701330676.0076911,join,J1_1e7_NA_0_0,10000000,medium outer on int,10000000,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,0.402,3.531,TRUE,500043740.752;449999844.937,0.058,,False +ip-172-31-31-147,1701270373,1701330676.4824815,join,J1_1e7_NA_0_0,10000000,medium outer on int,10000000,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,0.395,3.456,TRUE,500043740.752;449999844.937,0.06,,False +ip-172-31-31-147,1701270373,1701330677.1112347,join,J1_1e7_NA_0_0,10000000,medium inner on factor,8998412,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,0.573,3.774,TRUE,449954076.026;449999844.937,0.021,,False +ip-172-31-31-147,1701270373,1701330677.7000687,join,J1_1e7_NA_0_0,10000000,medium inner on factor,8998412,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,0.548,3.842,TRUE,449954076.026;449999844.937,0.021,,False +ip-172-31-31-147,1701270373,1701330681.05543,join,J1_1e7_NA_0_0,10000000,big inner on int,9000000,13,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,3.301,6.838,TRUE,450032091.841;449860428.616,0.021,,False +ip-172-31-31-147,1701270373,1701330684.104251,join,J1_1e7_NA_0_0,10000000,big inner on int,9000000,13,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,3.007,7.982,TRUE,450032091.841;449860428.616,0.021,,False +ip-172-31-31-147,1701270373,1701330842.793844,join,J1_1e8_NA_0_0,100000000,small inner on int,89997128,9,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,3.189,29.063,TRUE,4499430832.39;4388703871.229,0.239,,False +ip-172-31-31-147,1701270373,1701330845.8377287,join,J1_1e8_NA_0_0,100000000,small inner on int,89997128,9,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,2.793,30.591,TRUE,4499430832.39;4388703871.229,0.221,,False +ip-172-31-31-147,1701270373,1701330849.3190267,join,J1_1e8_NA_0_0,100000000,medium inner on int,89995511,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,3.211,32.915,TRUE,4499423746.365;4507751463.252,0.22,,False +ip-172-31-31-147,1701270373,1701330852.6214228,join,J1_1e8_NA_0_0,100000000,medium inner on int,89995511,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,3.039,33.698,TRUE,4499423746.365;4507751463.252,0.228,,False +ip-172-31-31-147,1701270373,1701330856.895314,join,J1_1e8_NA_0_0,100000000,medium outer on int,100000000,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,3.425,31.78,TRUE,4999542477.919;4507751463.252,0.794,,False +ip-172-31-31-147,1701270373,1701330861.6940563,join,J1_1e8_NA_0_0,100000000,medium outer on int,100000000,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,3.974,32.036,TRUE,4999542477.919;4507751463.252,0.77,,False +ip-172-31-31-147,1701270373,1701330865.5744674,join,J1_1e8_NA_0_0,100000000,medium inner on factor,89995511,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,3.599,36.495,TRUE,4499423746.365;4507751463.252,0.223,,False +ip-172-31-31-147,1701270373,1701330869.3830082,join,J1_1e8_NA_0_0,100000000,medium inner on factor,89995511,11,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,3.544,36.732,TRUE,4499423746.365;4507751463.252,0.222,,False +ip-172-31-31-147,1701270373,1701330945.315101,join,J1_1e8_NA_0_0,100000000,big inner on int,90000000,13,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,1,75.662,58.546,TRUE,4499590098.078;4499913694.243,0.21,,False +ip-172-31-31-147,1701270373,1701331022.3666933,join,J1_1e8_NA_0_0,100000000,big inner on int,90000000,13,dask,2023.10.0,3a8f8248884f8c69b76d610d8a44d9b6501d7a7a,.merge,2,76.789,63.037,TRUE,4499590098.078;4499913694.243,0.21,,False + From 56e4c746bdf2ffbce77b1872a534851df425afbd Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 6 Dec 2023 12:02:15 +0000 Subject: [PATCH 25/25] fix bug for reporting arrow as R-arrow --- _report/report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_report/report.R b/_report/report.R index 29405a0d..35ef889f 100644 --- a/_report/report.R +++ b/_report/report.R @@ -71,7 +71,7 @@ clean_time = function(d) { old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6") # replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66) - d[which(solution == "arrow"),c("solution")] == "R-arrow" + d$solution[d$solution == "arrow"] <- "R-arrow" d[!nzchar(git), git := NA_character_ ][,"on_disk" := as.logical(on_disk) ][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_