From a4bc5ee55d56a19ac834d64247e53e23fde10d82 Mon Sep 17 00:00:00 2001 From: Nick Robinson Date: Sat, 11 Nov 2023 21:44:06 +0000 Subject: [PATCH 01/22] Trigger a profile before terminating worker upon timeout --- src/ReTestItems.jl | 3 ++- src/workers.jl | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index b36bfbbe..d6139421 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -535,7 +535,8 @@ function manage_worker( # Handle the exception if e isa TimeoutException @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker" - terminate!(worker) + trigger_profile(worker, :timeout) + terminate!(worker, :timeout) wait(worker) @error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \ Recording test error." diff --git a/src/workers.jl b/src/workers.jl index 93ccde5b..ccfd6bf3 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -3,6 +3,7 @@ module Workers using Sockets, Serialization export Worker, remote_eval, remote_fetch, terminate!, WorkerTerminatedException +export trigger_profile function try_with_timeout(f, timeout) cond = Threads.Condition() @@ -104,6 +105,22 @@ function watch_and_terminate!(w::Worker) true end +# Send signal to the given `Worker` process to trigger a profile. +# Users can customise this profiling in the usual way, e.g. via +# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, `Profile.set_peek_duration`, `Profile.peek_report[]` +# See https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution +function trigger_profile(w::Worker, from::Symbol=:manual) + if !Sys.iswindows() + @debug "sending profile request to worker $(w.pid) from $from" + if Sys.islinux() + kill(w.process, 10) # SIGUSR1 + elseif Sys.isbsd() + kill(w.process, 29) # SIGINFO + end + end + return nothing +end + # gracefully terminate a worker by sending a shutdown message # and waiting for the other tasks to perform worker shutdown function Base.close(w::Worker) From 525c176c689c310d6fe1cc18c598949127b3ac5c Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Thu, 18 Apr 2024 08:49:10 -0600 Subject: [PATCH 02/22] sleep after profile --- src/workers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workers.jl b/src/workers.jl index ccfd6bf3..4fc4ab3a 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -118,6 +118,7 @@ function trigger_profile(w::Worker, from::Symbol=:manual) kill(w.process, 29) # SIGINFO end end + sleep(120) # Leave time for it to print the profile. return nothing end From e8e906faf6801f14fb8023060bef82f3ddc0b8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 22 Apr 2024 15:29:45 +0200 Subject: [PATCH 03/22] Introduce `timeout_profile_wait` to control timeout-triggered profiles + tests --- Project.toml | 4 ++- src/ReTestItems.jl | 24 +++++++++----- src/workers.jl | 8 +++-- test/integrationtests.jl | 67 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) diff --git a/Project.toml b/Project.toml index 03f7e22d..443df93b 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ Dates = "1" Logging = "1" LoggingExtras = "1" Pkg = "1" +Profile = "1" Random = "1" Serialization = "1" Sockets = "1" @@ -30,9 +31,10 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6" IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5" [targets] -test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"] +test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Profile", "Random", "Test", "XMLDict"] diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index a33bcafb..56563a55 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -192,6 +192,11 @@ will be run. `paths` passed to it cannot contain test files, either because the path doesn't exist or the path points to a file which is not a test file. Default is `false`. Can also be set using the `RETESTITEMS_VALIDATE_PATHS` environment variable. +- `timeout_profile_wait::Real=0`: When non-zero, a worker that times-out will trigger a CPU profile + for which we will wait `timeout_profile_wait` seconds before terminating the worker. + Zero means no profile will be taken. Can also be set using the `RETESTITEMS_TIMEOUT_PROFILE_WAIT` + environment variable. See the [Profile documentation](https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution) + for more information on triggered profiles. Note you can use `worker_init_expr` to tweak the profile settings on workers. """ function runtests end @@ -237,6 +242,7 @@ function runtests( verbose_results::Bool=(logs !== :issues && isinteractive()), test_end_expr::Expr=Expr(:block), validate_paths::Bool=parse(Bool, get(ENV, "RETESTITEMS_VALIDATE_PATHS", "false")), + timeout_profile_wait::Real=parse(Int, get(ENV, "RETESTITEMS_TIMEOUT_PROFILE_WAIT", "0")), ) nworker_threads = _validated_nworker_threads(nworker_threads) paths′ = _validated_paths(paths, validate_paths) @@ -253,13 +259,14 @@ function runtests( nworkers = max(0, nworkers) retries = max(0, retries) timeout = ceil(Int, testitem_timeout) + timeout_profile_wait = ceil(Int, timeout_profile_wait) debuglvl = Int(debug) if debuglvl > 0 LoggingExtras.withlevel(LoggingExtras.Debug; verbosity=debuglvl) do - _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs) + _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait) end else - return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs) + return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait) end end @@ -273,7 +280,7 @@ end # By tracking and reusing test environments, we can avoid this issue. const TEST_ENVS = Dict{String, String}() -function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol) +function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int) # Don't recursively call `runtests` e.g. if we `include` a file which calls it. # So we ignore the `runtests(...)` call in `test/runtests.jl` when `runtests(...)` # was called from the command line. @@ -293,7 +300,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor if is_running_test_runtests_jl(proj_file) # Assume this is `Pkg.test`, so test env already active. @debugv 2 "Running in current environment `$(Base.active_project())`" - return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs) + return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait) else @debugv 1 "Activating test environment for `$proj_file`" orig_proj = Base.active_project() @@ -306,7 +313,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor testenv = TestEnv.activate() TEST_ENVS[proj_file] = testenv end - _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs) + _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait) finally Base.set_active_project(orig_proj) end @@ -317,6 +324,7 @@ end function _runtests_in_current_env( shouldrun, paths, projectfile::String, nworkers::Int, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, + timeout_profile_wait::Int, ) start_time = time() proj_name = something(Pkg.Types.read_project(projectfile).name, "") @@ -381,7 +389,7 @@ function _runtests_in_current_env( ti = starting[i] @spawn begin with_logger(original_logger) do - manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs) + manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs, $timeout_profile_wait) end end end @@ -490,7 +498,7 @@ end function manage_worker( worker::Worker, proj_name::AbstractString, testitems::TestItems, testitem::Union{TestItem,Nothing}, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr, - default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol + default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int ) ntestitems = length(testitems.testitems) run_number = 1 @@ -554,7 +562,7 @@ function manage_worker( # Handle the exception if e isa TimeoutException @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker" - trigger_profile(worker, :timeout) + timeout_profile_wait > 0 && trigger_profile(worker, timeout_profile_wait, :timeout) terminate!(worker, :timeout) wait(worker) @error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \ diff --git a/src/workers.jl b/src/workers.jl index 0a971609..7ef1b9dc 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -108,9 +108,11 @@ end # Send signal to the given `Worker` process to trigger a profile. # Users can customise this profiling in the usual way, e.g. via -# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, `Profile.set_peek_duration`, `Profile.peek_report[]` +# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, but `Profile.set_peek_duration`, `Profile.peek_report[]` +# would have to be modified in the worker process. # See https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution -function trigger_profile(w::Worker, from::Symbol=:manual) +# Called when timeout_profile_wait is non-zero. +function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual) if !Sys.iswindows() @debug "sending profile request to worker $(w.pid) from $from" if Sys.islinux() @@ -119,7 +121,7 @@ function trigger_profile(w::Worker, from::Symbol=:manual) kill(w.process, 29) # SIGINFO end end - sleep(120) # Leave time for it to print the profile. + sleep(timeout_profile_wait) # Leave time for it to print the profile. return nothing end diff --git a/test/integrationtests.jl b/test/integrationtests.jl index 42374825..bb628f76 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -819,6 +819,73 @@ end @test ts.time_end - ts.time_start ≈ timeout end +@testset "CPU profile timeout trigger" begin + using Profile + + function capture_timeout_profile(f, timeout_profile_wait; kwargs...) + logs = mktemp() do path, io + redirect_stdio(stdout=io, stderr=io, stdin=devnull) do + encased_testset() do + if isnothing(timeout_profile_wait) + runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, kwargs...) + else + runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, timeout_profile_wait, kwargs...) + end + end + end + flush(io) + close(io) + read(path, String) + end + f(logs) + return nothing + end + + # No profile is collected when timeout_profile_wait is zero. + capture_timeout_profile(0) do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @test !occursin("Information request received", logs) + end + + # Profile is collected when timeout_profile_wait is non-zero. + default_peektime = Profile.get_peek_duration() + capture_timeout_profile(5) do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime) second profile", logs) + @test occursin("Profile collected.", logs) + end + + # Profile is collected with and set_peek_duration is respected. + capture_timeout_profile(5, worker_init_expr=:(using Profile; Profile.set_peek_duration($default_peektime + 1.0))) do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime + 1.0) second profile", logs) + @test occursin("Profile collected.", logs) + end + + + # The RETESTITEMS_TIMEOUT_PROFILE_WAIT environment variable can be used to set the timeout_profile_wait. + withenv("RETESTITEMS_TIMEOUT_PROFILE_WAIT" => "5") do + capture_timeout_profile(nothing) do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @test occursin("Information request received", logs) + @test occursin("Profile collected.", logs) + end + end + + # The profile is collected for each worker thread. + capture_timeout_profile(5, nworker_threads="3,2") do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @test occursin("Information request received", logs) + @test occursin(r"Worker \d+: Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Worker \d+: Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Worker \d+: Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Worker \d+: Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Worker \d+: Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test !occursin(r"Worker \d+: Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin("Profile collected.", logs) + end +end + @testset "worker always crashes immediately" begin file = joinpath(TEST_FILES_DIR, "_happy_tests.jl") From 2679113045452a7e227e857ab36b58241c3a51b2 Mon Sep 17 00:00:00 2001 From: Drvi Date: Mon, 22 Apr 2024 17:48:02 +0200 Subject: [PATCH 04/22] Print captured logs after we finish writing the profile --- src/ReTestItems.jl | 8 +++++++- test/integrationtests.jl | 16 +++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index 56563a55..361e3df1 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -558,23 +558,29 @@ function manage_worker( catch e @debugv 2 "Error" exception=e println(DEFAULT_STDOUT[]) - _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) # Handle the exception if e isa TimeoutException @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker" timeout_profile_wait > 0 && trigger_profile(worker, timeout_profile_wait, :timeout) terminate!(worker, :timeout) wait(worker) + # TODO: We print the captured logs after the worker is terminated, + # which means that we include an annoying stackrace from the worker termination, + # but the profiles don't seem to get flushed properly if we don't do this. + # This is not an issue with eager logs, but when going through a file, this seems to help. + _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) @error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \ Recording test error." record_timeout!(testitem, run_number, timeout) elseif e isa WorkerTerminatedException + _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) @error "$worker died running test item $(repr(testitem.name)). \ Recording test error." record_worker_terminated!(testitem, worker, run_number) else # We don't expect any other kind of error, so rethrow, which will propagate # back up to the main coordinator task and throw to the user + _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) rethrow() end # Handle retries diff --git a/test/integrationtests.jl b/test/integrationtests.jl index bb628f76..e36c6723 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -873,16 +873,18 @@ end end # The profile is collected for each worker thread. - capture_timeout_profile(5, nworker_threads="3,2") do logs + for log_capture in (:eager, :batched) + capture_timeout_profile(5, nworker_threads="3,2", logs=log_capture) do logs @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) @test occursin("Information request received", logs) - @test occursin(r"Worker \d+: Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Worker \d+: Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Worker \d+: Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Worker \d+: Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Worker \d+: Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test !occursin(r"Worker \d+: Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test !occursin(r"Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) @test occursin("Profile collected.", logs) + end end end From 3071046875483eeb7b92c6138866a563a9ea6f31 Mon Sep 17 00:00:00 2001 From: Drvi Date: Mon, 22 Apr 2024 18:29:51 +0200 Subject: [PATCH 05/22] Try forcing a flush on worker exit --- src/workers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workers.jl b/src/workers.jl index 7ef1b9dc..7c172909 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -295,6 +295,7 @@ function startworker() # don't need stdin (copied from Distributed.start_worker) redirect_stdin(devnull) close(stdin) + atexit(() -> (flush(stdout); flush(stderr))) redirect_stderr(stdout) # redirect stderr so coordinator reads everything from stdout port, sock = listenany(UInt16(rand(10000:50000))) # send the port to the coordinator From 251445112e3eb1fe5301b7e4c9ff906207e9b6c3 Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 13:46:38 +0200 Subject: [PATCH 06/22] Test stacktraces are printed instead of testing the profile (which was too flaky) --- test/integrationtests.jl | 46 +++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/test/integrationtests.jl b/test/integrationtests.jl index e36c6723..efed7837 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -821,15 +821,16 @@ end @testset "CPU profile timeout trigger" begin using Profile - + # We're only testing that the signal was registered and that the stacktrace was printed. + # We also tried testing that the CPU profile was displayed, but that was too flaky in CI. function capture_timeout_profile(f, timeout_profile_wait; kwargs...) logs = mktemp() do path, io redirect_stdio(stdout=io, stderr=io, stdin=devnull) do encased_testset() do if isnothing(timeout_profile_wait) - runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, kwargs...) + runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=3, kwargs...) else - runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, timeout_profile_wait, kwargs...) + runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=3, timeout_profile_wait, kwargs...) end end end @@ -838,51 +839,56 @@ end read(path, String) end f(logs) - return nothing + return logs end - # No profile is collected when timeout_profile_wait is zero. + @testset "timeout_profile_wait=0 means no CPU profile" begin capture_timeout_profile(0) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test !occursin("Information request received", logs) + end end - # Profile is collected when timeout_profile_wait is non-zero. + default_peektime = Profile.get_peek_duration() + @testset "non-zero timeout_profile_wait means we collect a CPU profile" begin capture_timeout_profile(5) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime) second profile", logs) + @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) + end end - # Profile is collected with and set_peek_duration is respected. + + @testset "`set_peek_duration` is respected in `worker_init_expr`" begin capture_timeout_profile(5, worker_init_expr=:(using Profile; Profile.set_peek_duration($default_peektime + 1.0))) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime + 1.0) second profile", logs) + @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) + end end # The RETESTITEMS_TIMEOUT_PROFILE_WAIT environment variable can be used to set the timeout_profile_wait. + @testset "RETESTITEMS_TIMEOUT_PROFILE_WAIT environment variable" begin withenv("RETESTITEMS_TIMEOUT_PROFILE_WAIT" => "5") do capture_timeout_profile(nothing) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received", logs) + @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) + end end end # The profile is collected for each worker thread. - for log_capture in (:eager, :batched) - capture_timeout_profile(5, nworker_threads="3,2", logs=log_capture) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 2 seconds", logs) + @testset "CPU profile with $(repr(log_capture)) logs" for log_capture in (:eager, :batched) + logs = capture_timeout_profile(5, nworker_threads="3,2", logs=log_capture) do logs + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received", logs) - @test occursin(r"Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test occursin(r"Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - @test !occursin(r"Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) end end From fcb2b850434801f9263394550e189da4d8684554 Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 14:22:42 +0200 Subject: [PATCH 07/22] 1.8 doesn't have interactive threads --- test/integrationtests.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/integrationtests.jl b/test/integrationtests.jl index efed7837..9bae7dc9 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -839,12 +839,12 @@ end read(path, String) end f(logs) + @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) return logs end @testset "timeout_profile_wait=0 means no CPU profile" begin capture_timeout_profile(0) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test !occursin("Information request received", logs) end end @@ -853,7 +853,6 @@ end default_peektime = Profile.get_peek_duration() @testset "non-zero timeout_profile_wait means we collect a CPU profile" begin capture_timeout_profile(5) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime) second profile", logs) @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) @@ -863,7 +862,6 @@ end @testset "`set_peek_duration` is respected in `worker_init_expr`" begin capture_timeout_profile(5, worker_init_expr=:(using Profile; Profile.set_peek_duration($default_peektime + 1.0))) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received. A stacktrace will print followed by a $(default_peektime + 1.0) second profile", logs) @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) @@ -875,7 +873,6 @@ end @testset "RETESTITEMS_TIMEOUT_PROFILE_WAIT environment variable" begin withenv("RETESTITEMS_TIMEOUT_PROFILE_WAIT" => "5") do capture_timeout_profile(nothing) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) @test occursin("Information request received", logs) @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) @@ -884,9 +881,8 @@ end end # The profile is collected for each worker thread. - @testset "CPU profile with $(repr(log_capture)) logs" for log_capture in (:eager, :batched) - logs = capture_timeout_profile(5, nworker_threads="3,2", logs=log_capture) do logs - @assert occursin("timed out running test item \"Test item takes 60 seconds\" after 3 seconds", logs) + @testset "CPU profile with $(repr(log_capture))" for log_capture in (:eager, :batched) + capture_timeout_profile(5, nworker_threads=VERSION >= v"1.9" ? "3,2" : "3", logs=log_capture) do logs @test occursin("Information request received", logs) @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows) @test occursin("Profile collected.", logs) From 3407e7e73265206654ba06d08ab1fae351b4a9bd Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 14:30:18 +0200 Subject: [PATCH 08/22] . --- src/workers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/workers.jl b/src/workers.jl index 7c172909..7ef1b9dc 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -295,7 +295,6 @@ function startworker() # don't need stdin (copied from Distributed.start_worker) redirect_stdin(devnull) close(stdin) - atexit(() -> (flush(stdout); flush(stderr))) redirect_stderr(stdout) # redirect stderr so coordinator reads everything from stdout port, sock = listenany(UInt16(rand(10000:50000))) # send the port to the coordinator From 83ecc8ccd671b72f9e66716eaf5bf5d850ab5bec Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 14:48:57 +0200 Subject: [PATCH 09/22] Add unit test --- test/integrationtests.jl | 2 +- test/workers.jl | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/test/integrationtests.jl b/test/integrationtests.jl index 9bae7dc9..d18e005c 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -822,7 +822,7 @@ end @testset "CPU profile timeout trigger" begin using Profile # We're only testing that the signal was registered and that the stacktrace was printed. - # We also tried testing that the CPU profile was displayed, but that was too flaky in CI. + # We also tried testing that the CPU profile was displayed here, but that was too flaky in CI. function capture_timeout_profile(f, timeout_profile_wait; kwargs...) logs = mktemp() do path, io redirect_stdio(stdout=io, stderr=io, stdin=devnull) do diff --git a/test/workers.jl b/test/workers.jl index 40b0e802..4e69fee3 100644 --- a/test/workers.jl +++ b/test/workers.jl @@ -93,4 +93,28 @@ using Test close(w) end + @testset "CPU profile" begin + logs = mktemp() do path, io + w = Worker(threads=VERSION > v"1.9" ? "3,2" : "3", worker_redirect_io=io) + fut = remote_eval(w, :(sleep(5), yield())) + sleep(0.5) + trigger_profile(w, 1, :test) + fetch(fut) + close(w) + flush(io) + close(io) + return read(path, String) + end + + @test occursin(r"Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + if VERSION > v"1.9" + @test occursin(r"Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test occursin(r"Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + @test !occursin(r"Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + else + @test !occursin(r"Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) + end + end end # workers.jl testset From 6827599e416ce6c47b8bb1183d0fe05420a8485f Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 14:50:28 +0200 Subject: [PATCH 10/22] . --- test/workers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/workers.jl b/test/workers.jl index 4e69fee3..65dbd243 100644 --- a/test/workers.jl +++ b/test/workers.jl @@ -95,7 +95,7 @@ using Test @testset "CPU profile" begin logs = mktemp() do path, io - w = Worker(threads=VERSION > v"1.9" ? "3,2" : "3", worker_redirect_io=io) + w = Worker(threads=VERSION >= v"1.9" ? "3,2" : "3", worker_redirect_io=io) fut = remote_eval(w, :(sleep(5), yield())) sleep(0.5) trigger_profile(w, 1, :test) From 864d6cfa6bb722cc6f99c9ed5e9f9fc9bd6b06f4 Mon Sep 17 00:00:00 2001 From: Drvi Date: Sat, 27 Apr 2024 14:55:39 +0200 Subject: [PATCH 11/22] . --- test/workers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/workers.jl b/test/workers.jl index 65dbd243..6eece79a 100644 --- a/test/workers.jl +++ b/test/workers.jl @@ -109,7 +109,7 @@ using Test @test occursin(r"Thread 1 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) @test occursin(r"Thread 2 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) @test occursin(r"Thread 3 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) - if VERSION > v"1.9" + if VERSION >= v"1.9" @test occursin(r"Thread 4 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) @test occursin(r"Thread 5 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) @test !occursin(r"Thread 6 Task 0x\w+ Total snapshots: \d+. Utilization: \d+%", logs) From d1dc637e575a26d5f3656f9cfcd2b71659f78cc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 14:22:58 +0200 Subject: [PATCH 12/22] Print newline just before the logs --- src/ReTestItems.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index 97329962..ce052cc8 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -559,10 +559,9 @@ function manage_worker( end catch e @debugv 2 "Error" exception=e - println(DEFAULT_STDOUT[]) # Handle the exception if e isa TimeoutException - @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker" + @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker $(timeout_profile_wait > 0 ? "and triggering profile" : "")." timeout_profile_wait > 0 && trigger_profile(worker, timeout_profile_wait, :timeout) terminate!(worker, :timeout) wait(worker) @@ -570,11 +569,13 @@ function manage_worker( # which means that we include an annoying stackrace from the worker termination, # but the profiles don't seem to get flushed properly if we don't do this. # This is not an issue with eager logs, but when going through a file, this seems to help. + println(DEFAULT_STDOUT[]) _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) @error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \ Recording test error." record_timeout!(testitem, run_number, timeout) elseif e isa WorkerTerminatedException + println(DEFAULT_STDOUT[]) _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) @error "$worker died running test item $(repr(testitem.name)). \ Recording test error." @@ -582,6 +583,7 @@ function manage_worker( else # We don't expect any other kind of error, so rethrow, which will propagate # back up to the main coordinator task and throw to the user + println(DEFAULT_STDOUT[]) _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number) rethrow() end From 931bde42c1165535d8cfbcaf659aad1d7e6a5bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 14:49:15 +0200 Subject: [PATCH 13/22] Experiment: custom `peek_report` --- Project.toml | 4 ++-- src/ReTestItems.jl | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 6881085a..3676e463 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -31,10 +32,9 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6" IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5" [targets] -test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Profile", "Random", "Test", "XMLDict"] +test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"] diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index ce052cc8..751544f5 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -416,6 +416,19 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; # remote_fetch here because we want to make sure the worker is all setup before starting to eval testitems remote_fetch(w, quote using ReTestItems, Test + let + import Profile: peek_report + # Like https://github.com/JuliaLang/julia/blob/6023ad6718514c15b3297197757ae3d93b85270b/stdlib/Profile/src/Profile.jl#L65-L70 + # but doesn't limit the IOContext to the default display size + function _peek_report() + iob = IOBuffer() + println(iob, "Worker$i: CPU profile") + ioc = IOContext(iob, stdout) + print(ioc, groupby = [:thread, :task]) + Base.print(stdout, String(resize!(iob.data, iob.size))) + end + peek_report[] = _peek_report + end Test.TESTSET_PRINT_ENABLE[] = false const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems) GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules() From 699d37ff6691b5da627b42f5fa5b7dd63bde38e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 15:05:56 +0200 Subject: [PATCH 14/22] . --- src/ReTestItems.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index 751544f5..3237bb45 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -5,6 +5,7 @@ using Dates: DateTime, ISODateTimeFormat, format, now, unix2datetime using Test: Test, DefaultTestSet, TestSetException using .Threads: @spawn, nthreads using Pkg: Pkg +using Profile using TestEnv using Logging using LoggingExtras @@ -417,7 +418,6 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; remote_fetch(w, quote using ReTestItems, Test let - import Profile: peek_report # Like https://github.com/JuliaLang/julia/blob/6023ad6718514c15b3297197757ae3d93b85270b/stdlib/Profile/src/Profile.jl#L65-L70 # but doesn't limit the IOContext to the default display size function _peek_report() @@ -427,7 +427,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; print(ioc, groupby = [:thread, :task]) Base.print(stdout, String(resize!(iob.data, iob.size))) end - peek_report[] = _peek_report + ReTestItems.Profile.peek_report[] = _peek_report end Test.TESTSET_PRINT_ENABLE[] = false const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems) From 943ded452d38a5ca77f4aaf37021a9a5a209aed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 15:12:46 +0200 Subject: [PATCH 15/22] . --- src/ReTestItems.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index 3237bb45..b4512645 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -420,6 +420,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; let # Like https://github.com/JuliaLang/julia/blob/6023ad6718514c15b3297197757ae3d93b85270b/stdlib/Profile/src/Profile.jl#L65-L70 # but doesn't limit the IOContext to the default display size + Profile = Base.require(@__MODULE__, :Profile) function _peek_report() iob = IOBuffer() println(iob, "Worker$i: CPU profile") @@ -427,7 +428,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; print(ioc, groupby = [:thread, :task]) Base.print(stdout, String(resize!(iob.data, iob.size))) end - ReTestItems.Profile.peek_report[] = _peek_report + peek_report[] = _peek_report end Test.TESTSET_PRINT_ENABLE[] = false const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems) From c72cbeab3794bbce168148719ca1113486eacbd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 15:14:04 +0200 Subject: [PATCH 16/22] . --- src/ReTestItems.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index b4512645..bea9a81a 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -428,7 +428,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; print(ioc, groupby = [:thread, :task]) Base.print(stdout, String(resize!(iob.data, iob.size))) end - peek_report[] = _peek_report + Profile.peek_report[] = _peek_report end Test.TESTSET_PRINT_ENABLE[] = false const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems) From 52dbcce2e2f7ff362a39343eda661e2db4df2af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 29 Apr 2024 15:44:06 +0200 Subject: [PATCH 17/22] Well, that didn't work, just increase the displaysize --- Project.toml | 4 ++-- src/ReTestItems.jl | 14 -------------- src/log_capture.jl | 4 +++- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index 3676e463..6881085a 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,6 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -32,9 +31,10 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6" IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5" [targets] -test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"] +test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Profile", "Random", "Test", "XMLDict"] diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index bea9a81a..ce052cc8 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -5,7 +5,6 @@ using Dates: DateTime, ISODateTimeFormat, format, now, unix2datetime using Test: Test, DefaultTestSet, TestSetException using .Threads: @spawn, nthreads using Pkg: Pkg -using Profile using TestEnv using Logging using LoggingExtras @@ -417,19 +416,6 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; # remote_fetch here because we want to make sure the worker is all setup before starting to eval testitems remote_fetch(w, quote using ReTestItems, Test - let - # Like https://github.com/JuliaLang/julia/blob/6023ad6718514c15b3297197757ae3d93b85270b/stdlib/Profile/src/Profile.jl#L65-L70 - # but doesn't limit the IOContext to the default display size - Profile = Base.require(@__MODULE__, :Profile) - function _peek_report() - iob = IOBuffer() - println(iob, "Worker$i: CPU profile") - ioc = IOContext(iob, stdout) - print(ioc, groupby = [:thread, :task]) - Base.print(stdout, String(resize!(iob.data, iob.size))) - end - Profile.peek_report[] = _peek_report - end Test.TESTSET_PRINT_ENABLE[] = false const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems) GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules() diff --git a/src/log_capture.jl b/src/log_capture.jl index 968cee70..6d36ac9d 100644 --- a/src/log_capture.jl +++ b/src/log_capture.jl @@ -107,7 +107,9 @@ If target is String it is assumed it is a file path. _redirect_logs(f, path::String) = open(io->_redirect_logs(f, io), path, "w") function _redirect_logs(f, target::IO) target === DEFAULT_STDOUT[] && return f() - colored_io = IOContext(target, :color => get(DEFAULT_STDOUT[], :color, false)) + # If we're not doing :eager logs, make sure the displaysize is large so we don't truncate + # CPU profiles. + colored_io = IOContext(target, :color => get(DEFAULT_STDOUT[], :color, false), :displaysize => (10000,10000)) # In case the default logger was changed by the user, we need to make sure the new logstate # is poinitng to the new stderr. # Adapted from https://github.com/JuliaIO/Suppressor.jl/blob/cbfc46f1450b03d6b69dad4c35de739290ff0aff/src/Suppressor.jl#L158-L161 From 1a29db0757f36d17b904d3e96d9266e77fb51314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Tue, 30 Apr 2024 12:21:10 +0200 Subject: [PATCH 18/22] Give the user a heads up about the CPU profile being collected --- src/ReTestItems.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index ce052cc8..feb9b821 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -561,8 +561,11 @@ function manage_worker( @debugv 2 "Error" exception=e # Handle the exception if e isa TimeoutException - @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker $(timeout_profile_wait > 0 ? "and triggering profile" : "")." - timeout_profile_wait > 0 && trigger_profile(worker, timeout_profile_wait, :timeout) + if timeout_profile_wait > 0 + @warn "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \ + A CPU profile will be triggered on the worker and then it will be terminated." + trigger_profile(worker, timeout_profile_wait, :timeout) + end terminate!(worker, :timeout) wait(worker) # TODO: We print the captured logs after the worker is terminated, From d42ae31a2456a3beb1c1cdee35957b65701081bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Tue, 30 Apr 2024 14:06:49 +0200 Subject: [PATCH 19/22] v1.24.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6881085a..9cc91392 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "ReTestItems" uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823" -version = "1.23.3" +version = "1.24.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" From 43df1a8483cc6b887c06daacdb72c7c07c0c9a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 6 May 2024 17:25:58 +0200 Subject: [PATCH 20/22] PR feedback --- Project.toml | 2 +- src/ReTestItems.jl | 4 +++- src/workers.jl | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 9cc91392..6881085a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "ReTestItems" uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823" -version = "1.24.0" +version = "1.23.3" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index feb9b821..070790f2 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -250,7 +250,8 @@ function runtests( logs in LOG_DISPLAY_MODES || throw(ArgumentError("`logs` must be one of $LOG_DISPLAY_MODES, got $(repr(logs))")) report && logs == :eager && throw(ArgumentError("`report=true` is not compatible with `logs=:eager`")) (0 ≤ memory_threshold ≤ 1) || throw(ArgumentError("`memory_threshold` must be between 0 and 1, got $(repr(memory_threshold))")) - testitem_timeout > 0 || throw(ArgumentError("`testitem_timeout` must be a postive number, got $(repr(testitem_timeout))")) + testitem_timeout > 0 || throw(ArgumentError("`testitem_timeout` must be a positive number, got $(repr(testitem_timeout))")) + timeout_profile_wait > 0 || throw(ArgumentError("`timeout_profile_wait` must be a positive number, got $(repr(timeout_profile_wait))")) # If we were given paths but none were valid, then nothing to run. !isempty(paths) && isempty(paths′) && return nothing shouldrun_combined(ti) = shouldrun(ti) && _shouldrun(name, ti.name) && _shouldrun(tags, ti.tags) @@ -260,6 +261,7 @@ function runtests( retries = max(0, retries) timeout = ceil(Int, testitem_timeout) timeout_profile_wait = ceil(Int, timeout_profile_wait) + (timeout_profile_wait > 0 && Sys.iswindows()) && @warn "CPU profiles on timeout is not supported on Windows, ignoring `timeout_profile_wait`" debuglvl = Int(debug) if debuglvl > 0 LoggingExtras.withlevel(LoggingExtras.Debug; verbosity=debuglvl) do diff --git a/src/workers.jl b/src/workers.jl index 7ef1b9dc..ebaac6f3 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -120,8 +120,8 @@ function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual) elseif Sys.isbsd() kill(w.process, 29) # SIGINFO end + sleep(timeout_profile_wait) # Leave time for it to print the profile. end - sleep(timeout_profile_wait) # Leave time for it to print the profile. return nothing end From 9d4f1b97a580f25d7974ed3e8de1fc841929da79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 6 May 2024 17:28:43 +0200 Subject: [PATCH 21/22] . --- src/ReTestItems.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl index 070790f2..878e9934 100644 --- a/src/ReTestItems.jl +++ b/src/ReTestItems.jl @@ -251,7 +251,7 @@ function runtests( report && logs == :eager && throw(ArgumentError("`report=true` is not compatible with `logs=:eager`")) (0 ≤ memory_threshold ≤ 1) || throw(ArgumentError("`memory_threshold` must be between 0 and 1, got $(repr(memory_threshold))")) testitem_timeout > 0 || throw(ArgumentError("`testitem_timeout` must be a positive number, got $(repr(testitem_timeout))")) - timeout_profile_wait > 0 || throw(ArgumentError("`timeout_profile_wait` must be a positive number, got $(repr(timeout_profile_wait))")) + timeout_profile_wait >= 0 || throw(ArgumentError("`timeout_profile_wait` must be a non-negative number, got $(repr(timeout_profile_wait))")) # If we were given paths but none were valid, then nothing to run. !isempty(paths) && isempty(paths′) && return nothing shouldrun_combined(ti) = shouldrun(ti) && _shouldrun(name, ti.name) && _shouldrun(tags, ti.tags) From 9c72a0d7d45e677be46584090f060ee26c90f59c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Mon, 6 May 2024 17:44:40 +0200 Subject: [PATCH 22/22] :) --- test/integrationtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integrationtests.jl b/test/integrationtests.jl index d18e005c..20761260 100644 --- a/test/integrationtests.jl +++ b/test/integrationtests.jl @@ -683,7 +683,7 @@ end @test err.value == string(ErrorException("Timed out after 4s running test item \"Test item takes 60 seconds\" (run=1)")) for t in (0, -1.1) - expected = ArgumentError("`testitem_timeout` must be a postive number, got $t") + expected = ArgumentError("`testitem_timeout` must be a positive number, got $t") @test_throws expected runtests(file; nworkers, testitem_timeout=t) end end