diff --git a/CHANGELOG.md b/CHANGELOG.md index bd0bd942..eca211e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ([#898](https://github.com/ericaltendorf/plotman/pull/898)) - Output same entries to plotman.log from 'plotman interactive' and ' plotman plot/archive' "daemons". ([#878](https://github.com/ericaltendorf/plotman/pull/878)) +- `plotman graph` command to create a matplotlib plot for completed plots. + Creates a graph image showing plots over time, average plot rate, average plot time, and total number of plots over time. + ([#612](https://github.com/ericaltendorf/plotman/pull/612)) ## [0.5.1] - 2021-07-15 ### Fixed diff --git a/mypy.ini b/mypy.ini index 2b2c6b0f..d4e50318 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,6 +9,9 @@ ignore_missing_imports = true [mypy-click] ignore_missing_imports = true +[mypy-matplotlib] +ignore_missing_imports = true + [mypy-pendulum] # TODO: https://github.com/sdispater/pendulum/pull/551 implicit_reexport = true diff --git a/setup.cfg b/setup.cfg index b43ad338..28933724 100644 --- a/setup.cfg +++ b/setup.cfg @@ -73,6 +73,10 @@ checks = mypy == 0.902 types-pkg_resources ~= 0.1.2 %(test)s + %(graph)s +graph = + matplotlib ~= 3.4 + numpy ~= 1.20 [options.data_files] config = src/plotman/resources/plotman.yaml diff --git a/src/plotman/graph.py b/src/plotman/graph.py new file mode 100644 index 00000000..7aa0cddf --- /dev/null +++ b/src/plotman/graph.py @@ -0,0 +1,203 @@ +import os + +import numpy as np +import matplotlib +import matplotlib.pyplot as plt + +import plotman.plotters + + +def create_ax_dumbbell( + ax: matplotlib.pyplot.axis, data: np.array, max_stacked: int = 50 +) -> None: + """ + Create a dumbbell plot of concurrent plot instances over time. + Parameters: + ax: a matplotlib axis. + data: numpy array with [start times, end times]. 
+ """ + + def newline(p1: float, p2: float) -> matplotlib.lines.Line2D: + l = matplotlib.lines.Line2D([p1[0], p2[0]], [p1[1], p2[1]], color="r") + ax.add_line(l) + return l + + # Prevent the stack from growing too tall + num_rows = data.shape[0] + stacker = [] + for _ in range(int(np.ceil(num_rows / float(max_stacked)))): + stacker.extend(list(range(max_stacked))) + stacker = np.array(stacker) + if num_rows % float(max_stacked) != 0: + stacker = stacker[: -(max_stacked - int(num_rows % float(max_stacked)))] + + for (p1, p2), i in zip(data[:, :2], stacker): + newline([p1, i], [p2, i]) + ax.scatter(data[:, 0], stacker, color="b") + ax.scatter(data[:, 1], stacker, color="b") + + ax.set_ylabel("Plots") + ax.set_xlim(np.min(data[:, 0]) - 2, np.max(data[:, 1]) + 2) + + +def create_ax_plotrate( + ax: matplotlib.pyplot.axis, data: np.array, end: bool = True, window: int = 3 +) -> None: + """ + Create a plot showing the rate of plotting over time. Can be computed + with respect to the plot start (this is rate of plot creation) or + with respect to the plot end (this is rate of plot completion). + Parameters: + ax: a matplotlib axis. + data: numpy array with [start times, end times]. + end: T/F, compute plot creation or plot completion rate. + window: Window to compute rate over. 
+ """ + + def estimate_rate(data: np.array, window: int) -> np.array: + rate_list = [] + window_list = [] + # This takes care of when we don't have a full window + for i in range(window): + rate_list.append(data[i] - data[0]) + window_list.append(i) + # This takes care of when we do + for i in range(len(data) - window): + rate_list.append(data[i + window] - data[i]) + window_list.append(window) + rate_list, window_list = np.array(rate_list), np.array(window_list) + rate_list[rate_list == 0] = np.nan # This prevents div by zero error + return np.where( + np.logical_not(np.isnan(rate_list)), (window_list - 1) / rate_list, 0 + ) + + # Estimate the rate of ending or the rate of starting + if end: + rate = estimate_rate(data[:, 1], window) + ax.plot(data[:, 1], rate) + else: + rate = estimate_rate(data[:, 0], window) + ax.plot(data[:, 0], rate) + + ax.set_ylabel("Avg Plot Rate (plots/hour)") + ax.set_xlim(np.min(data[:, 0]) - 2, np.max(data[:, 1]) + 2) + + +def create_ax_plottime( + ax: matplotlib.pyplot.axis, data: np.array, window: int = 3 +) -> None: + """ + Create a plot showing the average time to create a single plot. This is + computed using a moving average. Note that the plot may not be + very accurate for the beginning and ending windows. + Parameters: + ax: a matplotlib axis. + data: numpy array with [start times, end times]. + window: Window to compute rate over. + """ + + # Compute moving avg + kernel = np.ones(window) / window + data_tiled = np.vstack( + ( + np.expand_dims(data[:, 1] - data[:, 0], axis=1), + np.tile(data[-1, 1] - data[-1, 0], (window - 1, 1)), + ) + ) + rolling_avg = np.convolve(data_tiled.squeeze(), kernel, mode="valid") + + ax.plot(data[:, 1], rolling_avg) + + ax.set_ylabel("Avg Plot Time (hours)") + ax.set_xlim(np.min(data[:, 0]) - 2, np.max(data[:, 1]) + 2) + + +def create_ax_plotcumulative(ax: matplotlib.pyplot.axis, data: np.array) -> None: + """ + Create a plot showing the cumulative number of plots over time. 
+ Parameters: + ax: a matplotlib axis. + data: numpy array with [start times, end times]. + """ + ax.plot(data[:, 1], range(data.shape[0])) + + ax.set_ylabel("Total plots (plots)") + ax.set_xlim(np.min(data[:, 0]) - 2, np.max(data[:, 1]) + 2) + + +def graph(logdir: str, figfile: str, latest_k: int, window: int) -> None: + assert window >= 2, "Cannot compute moving average over a window less than 2" + assert os.path.isdir(logdir) + + # Build a list of the logfiles + logdir = os.path.abspath(logdir) + logfilenames = [ + os.path.join(logdir, l) + for l in os.listdir(logdir) + if os.path.splitext(l)[-1] == ".log" + ] + + assert len(logfilenames) > 0, "Directory contains no files {}".format(logdir) + + # For each log file, extract the start, end, and duration + time_catter = [] + for logfilename in logfilenames: + with open(logfilename) as file: + try: + plotter_type = plotman.plotters.get_plotter_from_log(lines=file) + except plotman.errors.UnableToIdentifyPlotterFromLogError: + continue + + parser = plotter_type() + + with open(logfilename, "rb") as binary_file: + read_bytes = binary_file.read() + + parser.update(chunk=read_bytes) + info = parser.common_info() + + # Extract timing information + if info.total_time_raw != 0: + time_catter.append( + [ + info.started_at.timestamp(), + info.started_at.timestamp() + info.total_time_raw, + info.total_time_raw, + ] + ) + + assert len(time_catter) > 0, "No valid log files found" + + # This array will hold start and end data (in hours) + data_started_ended = np.array(time_catter) / (60 * 60) + + # Shift the data so that it starts at zero + data_started_ended -= np.min(data_started_ended[:, 0]) + + # Sort the rows by start time + data_started_ended = data_started_ended[np.argsort(data_started_ended[:, 0])] + + # Remove older entries + if latest_k is not None: + data_started_ended = data_started_ended[-latest_k:, :] + + # Create figure + num_plots = 4 + f, _ = plt.subplots(2, 1, figsize=(8, 10)) + ax = plt.subplot(num_plots, 1, 
1) + ax.set_title("Plot performance summary") + + create_ax_dumbbell(ax, data_started_ended) + + if data_started_ended.shape[0] > window: + ax = plt.subplot(num_plots, 1, 2) + create_ax_plotrate(ax, data_started_ended, end=True, window=window) + + ax = plt.subplot(num_plots, 1, 3) + create_ax_plottime(ax, data_started_ended, window=window) + + ax = plt.subplot(num_plots, 1, 4) + create_ax_plotcumulative(ax, data_started_ended) + + ax.set_xlabel("Time (hours)") + f.savefig(figfile) diff --git a/src/plotman/plotman.py b/src/plotman/plotman.py old mode 100755 new mode 100644 index fc0fa68a..8ad74488 --- a/src/plotman/plotman.py +++ b/src/plotman/plotman.py @@ -24,6 +24,7 @@ plot_util, reporting, csv_exporter, + graph, ) from plotman import resources as plotman_resources from plotman.job import Job @@ -157,6 +158,29 @@ def parse_args(self) -> typing.Any: "logfile", type=str, nargs="+", help="logfile(s) to analyze" ) + p_graph = sp.add_parser("graph", help="create graph with plotting statistics") + p_graph.add_argument( + "figfile", type=str, help="graph file produced as output (.png, .jpg, etc.)" + ) + p_graph.add_argument( + "--logdir", + type=str, + default=None, + help="directory containing multiple logfiles to graph", + ) + p_graph.add_argument( + "--latest_k", + type=int, + default=None, + help="if passed, will only graph statistics for the latest k plots", + ) + p_graph.add_argument( + "--window", + type=int, + default=3, + help="window size to compute moving average over", + ) + args = parser.parse_args() return args @@ -296,6 +320,15 @@ def main() -> None: args.logfile, args.clipterminals, args.bytmp, args.bybitfield ) + # + # Graphing of completed jobs + # + elif args.cmd == "graph": + # If no logdir was passed, use the dir specified in cfg (this will almost always be the case) + if args.logdir is None: + args.logdir = cfg.logging.plots + graph.graph(args.logdir, args.figfile, args.latest_k, args.window) + # # Exports log metadata to CSV # diff --git 
a/util/listlogs b/util/listlogs old mode 100755 new mode 100644