diff --git a/README.md b/README.md index 92d754e..b838a34 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,33 @@ A next-generation real-time visualization profiler for JVM 21+ environments, foc ## Features -- **Virtual Thread Monitoring**: Track creation, termination, and pinning of virtual threads -- **Memory & GC Monitoring**: Real-time garbage collection tracking with heap usage visualization -- **CPU Monitoring**: JVM and system CPU utilization tracking +### Virtual Thread Monitoring +- **Thread Lifecycle**: Track creation, termination, and pinning of virtual threads +- **Pinning Detection**: Identify pinned threads with detailed stack traces +- **Real-time State Tracking**: Monitor running, pinned, and ended thread states + +### Memory & GC Monitoring +- **GC Events**: Real-time garbage collection tracking with pause time analysis +- **Heap Usage**: Before/after heap visualization with trend analysis +- **Allocation Rate**: Track object allocation rate and top allocating classes +- **Metaspace Monitoring**: Monitor metaspace usage and growth rate +- **GC Overhead**: Calculate GC overhead percentage with warnings + +### CPU & Performance Monitoring +- **CPU Utilization**: JVM and system CPU tracking with history +- **Method Profiling**: Hot method detection via execution sampling +- **Lock Contention**: Monitor thread contention and lock wait times + +### Correlation Analysis +- **GC ↔ CPU Correlation**: Detect CPU spikes related to GC events +- **GC ↔ Pinning Correlation**: Identify pinning increases during GC +- **Automatic Recommendations**: Get actionable insights based on metrics + +### Core Features - **JFR Streaming**: Low-overhead event collection using JDK Flight Recorder - **Real-time Dashboard**: WebSocket-based streaming with interactive charts - **Lock-free Architecture**: High-performance ring buffer for event collection +- **Data Export**: Export events in CSV, JSON, or JSONL formats ## Requirements @@ -78,6 +99,16 @@ The agent accepts the 
following system properties: | `argus.gc.enabled` | `true` | Enable GC monitoring | | `argus.cpu.enabled` | `true` | Enable CPU monitoring | | `argus.cpu.interval` | `1000` | CPU sampling interval in milliseconds | +| `argus.allocation.enabled` | `false` | Enable allocation tracking (high overhead) | +| `argus.allocation.threshold` | `1048576` | Minimum allocation size to track (1MB) | +| `argus.metaspace.enabled` | `true` | Enable metaspace monitoring | +| `argus.profiling.enabled` | `false` | Enable method profiling (high overhead) | +| `argus.profiling.interval` | `20` | Profiling sampling interval (ms) | +| `argus.contention.enabled` | `false` | Enable lock contention tracking | +| `argus.contention.threshold` | `50` | Minimum contention duration (ms) | +| `argus.correlation.enabled` | `true` | Enable correlation analysis | + +See [Configuration Guide](docs/configuration.md) for detailed documentation. ## Architecture @@ -109,12 +140,17 @@ The agent accepts the following system properties: - `jdk.VirtualThreadPinned` - Pinning detection (critical for Loom performance) - `jdk.VirtualThreadSubmitFailed` - Submit failures -### GC Events +### GC & Memory Events - `jdk.GarbageCollection` - GC pause duration, cause, and type - `jdk.GCHeapSummary` - Heap usage before and after GC +- `jdk.ObjectAllocationInNewTLAB` - Object allocation tracking +- `jdk.MetaspaceSummary` - Metaspace usage monitoring -### CPU Events +### CPU & Performance Events - `jdk.CPULoad` - JVM and system CPU utilization +- `jdk.ExecutionSample` - Method execution sampling for CPU profiling +- `jdk.JavaMonitorEnter` - Lock acquisition contention +- `jdk.JavaMonitorWait` - Lock wait contention ## API Endpoints @@ -127,6 +163,11 @@ The agent accepts the following system properties: | `/cpu-metrics` | CPU utilization history | | `/pinning-analysis` | Pinning hotspot analysis | | `/export` | Export events (CSV, JSON, JSONL) | +| `/allocation-analysis` | Allocation rate and top allocating classes | +| 
`/metaspace-metrics` | Metaspace usage and growth | +| `/method-profiling` | Hot methods (Top 20) | +| `/contention-analysis` | Lock contention hotspots | +| `/correlation` | Correlation analysis and recommendations | ## Contributing diff --git a/argus-agent/src/main/java/io/argus/agent/ArgusAgent.java b/argus-agent/src/main/java/io/argus/agent/ArgusAgent.java index 2b29674..1f3c4ec 100644 --- a/argus-agent/src/main/java/io/argus/agent/ArgusAgent.java +++ b/argus-agent/src/main/java/io/argus/agent/ArgusAgent.java @@ -3,8 +3,12 @@ import io.argus.agent.config.AgentConfig; import io.argus.agent.jfr.JfrStreamingEngine; import io.argus.core.buffer.RingBuffer; +import io.argus.core.event.AllocationEvent; +import io.argus.core.event.ContentionEvent; import io.argus.core.event.CPUEvent; +import io.argus.core.event.ExecutionSampleEvent; import io.argus.core.event.GCEvent; +import io.argus.core.event.MetaspaceEvent; import io.argus.core.event.VirtualThreadEvent; import io.argus.server.ArgusServer; @@ -49,6 +53,10 @@ public final class ArgusAgent { private static volatile RingBuffer eventBuffer; private static volatile RingBuffer gcEventBuffer; private static volatile RingBuffer cpuEventBuffer; + private static volatile RingBuffer allocationEventBuffer; + private static volatile RingBuffer metaspaceEventBuffer; + private static volatile RingBuffer executionSampleEventBuffer; + private static volatile RingBuffer contentionEventBuffer; private static volatile ArgusServer server; private static volatile AgentConfig config; @@ -96,15 +104,46 @@ private static void initialize(String agentArgs) { cpuEventBuffer = new RingBuffer<>(config.getBufferSize()); } + // Initialize allocation event buffer if enabled + if (config.isAllocationEnabled()) { + allocationEventBuffer = new RingBuffer<>(config.getBufferSize()); + } + + // Initialize metaspace event buffer if enabled + if (config.isMetaspaceEnabled()) { + metaspaceEventBuffer = new RingBuffer<>(config.getBufferSize()); + } + + // 
Initialize execution sample event buffer if profiling enabled + if (config.isProfilingEnabled()) { + executionSampleEventBuffer = new RingBuffer<>(config.getBufferSize()); + } + + // Initialize contention event buffer if enabled + if (config.isContentionEnabled()) { + contentionEventBuffer = new RingBuffer<>(config.getBufferSize()); + } + // Start JFR streaming engine System.out.println("[Argus] Initializing JFR streaming engine..."); engine = new JfrStreamingEngine( eventBuffer, gcEventBuffer, cpuEventBuffer, + allocationEventBuffer, + metaspaceEventBuffer, + executionSampleEventBuffer, + contentionEventBuffer, config.isGcEnabled(), config.isCpuEnabled(), - config.getCpuIntervalMs() + config.getCpuIntervalMs(), + config.isAllocationEnabled(), + config.getAllocationThreshold(), + config.isMetaspaceEnabled(), + config.isProfilingEnabled(), + config.getProfilingIntervalMs(), + config.isContentionEnabled(), + config.getContentionThresholdMs() ); engine.start(); @@ -121,7 +160,17 @@ private static void initialize(String agentArgs) { } private static void startServer() { - server = new ArgusServer(config.getServerPort(), eventBuffer, gcEventBuffer, cpuEventBuffer); + server = new ArgusServer( + config.getServerPort(), + eventBuffer, + gcEventBuffer, + cpuEventBuffer, + allocationEventBuffer, + metaspaceEventBuffer, + executionSampleEventBuffer, + contentionEventBuffer, + config.isCorrelationEnabled() + ); Thread.ofPlatform() .name("argus-server") .daemon(true) diff --git a/argus-agent/src/main/java/io/argus/agent/config/AgentConfig.java b/argus-agent/src/main/java/io/argus/agent/config/AgentConfig.java index 88e1d41..3f4b815 100644 --- a/argus-agent/src/main/java/io/argus/agent/config/AgentConfig.java +++ b/argus-agent/src/main/java/io/argus/agent/config/AgentConfig.java @@ -14,6 +14,14 @@ *
  • {@code argus.gc.enabled} - Enable GC monitoring (default: true)
  • *
  • {@code argus.cpu.enabled} - Enable CPU monitoring (default: true)
  • *
  • {@code argus.cpu.interval} - CPU sampling interval in ms (default: 1000)
  • + *
  • {@code argus.allocation.enabled} - Enable allocation tracking (default: false, high overhead)
  • + *
  • {@code argus.allocation.threshold} - Minimum allocation size to track in bytes (default: 1048576, i.e. 1 MiB)
  • + *
  • {@code argus.metaspace.enabled} - Enable metaspace monitoring (default: true)
  • + *
  • {@code argus.profiling.enabled} - Enable method profiling (default: false, high overhead)
  • + *
  • {@code argus.profiling.interval} - Profiling sampling interval in ms (default: 20)
  • + *
  • {@code argus.contention.enabled} - Enable lock contention tracking (default: false)
  • + *
  • {@code argus.contention.threshold} - Minimum contention time to track in ms (default: 50)
  • + *
  • {@code argus.correlation.enabled} - Enable correlation analysis (default: true)
  • * */ public final class AgentConfig { @@ -24,6 +32,14 @@ public final class AgentConfig { private static final boolean DEFAULT_GC_ENABLED = true; private static final boolean DEFAULT_CPU_ENABLED = true; private static final int DEFAULT_CPU_INTERVAL_MS = 1000; + private static final boolean DEFAULT_ALLOCATION_ENABLED = false; // High overhead, opt-in only + private static final int DEFAULT_ALLOCATION_THRESHOLD = 1024 * 1024; // 1MB minimum + private static final boolean DEFAULT_METASPACE_ENABLED = true; + private static final boolean DEFAULT_PROFILING_ENABLED = false; + private static final int DEFAULT_PROFILING_INTERVAL_MS = 20; + private static final boolean DEFAULT_CONTENTION_ENABLED = false; // Can generate many events, opt-in + private static final int DEFAULT_CONTENTION_THRESHOLD_MS = 50; // Higher threshold for less noise + private static final boolean DEFAULT_CORRELATION_ENABLED = true; private final int bufferSize; private final int serverPort; @@ -31,15 +47,35 @@ public final class AgentConfig { private final boolean gcEnabled; private final boolean cpuEnabled; private final int cpuIntervalMs; + private final boolean allocationEnabled; + private final int allocationThreshold; + private final boolean metaspaceEnabled; + private final boolean profilingEnabled; + private final int profilingIntervalMs; + private final boolean contentionEnabled; + private final int contentionThresholdMs; + private final boolean correlationEnabled; private AgentConfig(int bufferSize, int serverPort, boolean serverEnabled, - boolean gcEnabled, boolean cpuEnabled, int cpuIntervalMs) { + boolean gcEnabled, boolean cpuEnabled, int cpuIntervalMs, + boolean allocationEnabled, int allocationThreshold, + boolean metaspaceEnabled, boolean profilingEnabled, + int profilingIntervalMs, boolean contentionEnabled, + int contentionThresholdMs, boolean correlationEnabled) { this.bufferSize = bufferSize; this.serverPort = serverPort; this.serverEnabled = serverEnabled; this.gcEnabled = 
gcEnabled; this.cpuEnabled = cpuEnabled; this.cpuIntervalMs = cpuIntervalMs; + this.allocationEnabled = allocationEnabled; + this.allocationThreshold = allocationThreshold; + this.metaspaceEnabled = metaspaceEnabled; + this.profilingEnabled = profilingEnabled; + this.profilingIntervalMs = profilingIntervalMs; + this.contentionEnabled = contentionEnabled; + this.contentionThresholdMs = contentionThresholdMs; + this.correlationEnabled = correlationEnabled; } /** @@ -57,8 +93,23 @@ public static AgentConfig fromSystemProperties() { boolean cpuEnabled = Boolean.parseBoolean( System.getProperty("argus.cpu.enabled", String.valueOf(DEFAULT_CPU_ENABLED))); int cpuIntervalMs = Integer.getInteger("argus.cpu.interval", DEFAULT_CPU_INTERVAL_MS); + boolean allocationEnabled = Boolean.parseBoolean( + System.getProperty("argus.allocation.enabled", String.valueOf(DEFAULT_ALLOCATION_ENABLED))); + int allocationThreshold = Integer.getInteger("argus.allocation.threshold", DEFAULT_ALLOCATION_THRESHOLD); + boolean metaspaceEnabled = Boolean.parseBoolean( + System.getProperty("argus.metaspace.enabled", String.valueOf(DEFAULT_METASPACE_ENABLED))); + boolean profilingEnabled = Boolean.parseBoolean( + System.getProperty("argus.profiling.enabled", String.valueOf(DEFAULT_PROFILING_ENABLED))); + int profilingIntervalMs = Integer.getInteger("argus.profiling.interval", DEFAULT_PROFILING_INTERVAL_MS); + boolean contentionEnabled = Boolean.parseBoolean( + System.getProperty("argus.contention.enabled", String.valueOf(DEFAULT_CONTENTION_ENABLED))); + int contentionThresholdMs = Integer.getInteger("argus.contention.threshold", DEFAULT_CONTENTION_THRESHOLD_MS); + boolean correlationEnabled = Boolean.parseBoolean( + System.getProperty("argus.correlation.enabled", String.valueOf(DEFAULT_CORRELATION_ENABLED))); - return new AgentConfig(bufferSize, serverPort, serverEnabled, gcEnabled, cpuEnabled, cpuIntervalMs); + return new AgentConfig(bufferSize, serverPort, serverEnabled, gcEnabled, cpuEnabled, 
cpuIntervalMs, + allocationEnabled, allocationThreshold, metaspaceEnabled, profilingEnabled, profilingIntervalMs, + contentionEnabled, contentionThresholdMs, correlationEnabled); } /** @@ -68,7 +119,10 @@ public static AgentConfig fromSystemProperties() { */ public static AgentConfig defaults() { return new AgentConfig(DEFAULT_BUFFER_SIZE, DEFAULT_SERVER_PORT, DEFAULT_SERVER_ENABLED, - DEFAULT_GC_ENABLED, DEFAULT_CPU_ENABLED, DEFAULT_CPU_INTERVAL_MS); + DEFAULT_GC_ENABLED, DEFAULT_CPU_ENABLED, DEFAULT_CPU_INTERVAL_MS, + DEFAULT_ALLOCATION_ENABLED, DEFAULT_ALLOCATION_THRESHOLD, DEFAULT_METASPACE_ENABLED, + DEFAULT_PROFILING_ENABLED, DEFAULT_PROFILING_INTERVAL_MS, DEFAULT_CONTENTION_ENABLED, + DEFAULT_CONTENTION_THRESHOLD_MS, DEFAULT_CORRELATION_ENABLED); } /** @@ -104,6 +158,38 @@ public int getCpuIntervalMs() { return cpuIntervalMs; } + public boolean isAllocationEnabled() { + return allocationEnabled; + } + + public int getAllocationThreshold() { + return allocationThreshold; + } + + public boolean isMetaspaceEnabled() { + return metaspaceEnabled; + } + + public boolean isProfilingEnabled() { + return profilingEnabled; + } + + public int getProfilingIntervalMs() { + return profilingIntervalMs; + } + + public boolean isContentionEnabled() { + return contentionEnabled; + } + + public int getContentionThresholdMs() { + return contentionThresholdMs; + } + + public boolean isCorrelationEnabled() { + return correlationEnabled; + } + @Override public String toString() { return "AgentConfig{" + @@ -113,6 +199,14 @@ public String toString() { ", gcEnabled=" + gcEnabled + ", cpuEnabled=" + cpuEnabled + ", cpuIntervalMs=" + cpuIntervalMs + + ", allocationEnabled=" + allocationEnabled + + ", allocationThreshold=" + allocationThreshold + + ", metaspaceEnabled=" + metaspaceEnabled + + ", profilingEnabled=" + profilingEnabled + + ", profilingIntervalMs=" + profilingIntervalMs + + ", contentionEnabled=" + contentionEnabled + + ", contentionThresholdMs=" + contentionThresholdMs + 
+ ", correlationEnabled=" + correlationEnabled + '}'; } @@ -126,6 +220,14 @@ public static final class Builder { private boolean gcEnabled = DEFAULT_GC_ENABLED; private boolean cpuEnabled = DEFAULT_CPU_ENABLED; private int cpuIntervalMs = DEFAULT_CPU_INTERVAL_MS; + private boolean allocationEnabled = DEFAULT_ALLOCATION_ENABLED; + private int allocationThreshold = DEFAULT_ALLOCATION_THRESHOLD; + private boolean metaspaceEnabled = DEFAULT_METASPACE_ENABLED; + private boolean profilingEnabled = DEFAULT_PROFILING_ENABLED; + private int profilingIntervalMs = DEFAULT_PROFILING_INTERVAL_MS; + private boolean contentionEnabled = DEFAULT_CONTENTION_ENABLED; + private int contentionThresholdMs = DEFAULT_CONTENTION_THRESHOLD_MS; + private boolean correlationEnabled = DEFAULT_CORRELATION_ENABLED; private Builder() { } @@ -160,9 +262,52 @@ public Builder cpuIntervalMs(int cpuIntervalMs) { return this; } + public Builder allocationEnabled(boolean allocationEnabled) { + this.allocationEnabled = allocationEnabled; + return this; + } + + public Builder allocationThreshold(int allocationThreshold) { + this.allocationThreshold = allocationThreshold; + return this; + } + + public Builder metaspaceEnabled(boolean metaspaceEnabled) { + this.metaspaceEnabled = metaspaceEnabled; + return this; + } + + public Builder profilingEnabled(boolean profilingEnabled) { + this.profilingEnabled = profilingEnabled; + return this; + } + + public Builder profilingIntervalMs(int profilingIntervalMs) { + this.profilingIntervalMs = profilingIntervalMs; + return this; + } + + public Builder contentionEnabled(boolean contentionEnabled) { + this.contentionEnabled = contentionEnabled; + return this; + } + + public Builder contentionThresholdMs(int contentionThresholdMs) { + this.contentionThresholdMs = contentionThresholdMs; + return this; + } + + public Builder correlationEnabled(boolean correlationEnabled) { + this.correlationEnabled = correlationEnabled; + return this; + } + public AgentConfig build() { 
return new AgentConfig(bufferSize, serverPort, serverEnabled, - gcEnabled, cpuEnabled, cpuIntervalMs); + gcEnabled, cpuEnabled, cpuIntervalMs, allocationEnabled, + allocationThreshold, metaspaceEnabled, profilingEnabled, + profilingIntervalMs, contentionEnabled, contentionThresholdMs, + correlationEnabled); } } } diff --git a/argus-agent/src/main/java/io/argus/agent/jfr/AllocationEventExtractor.java b/argus-agent/src/main/java/io/argus/agent/jfr/AllocationEventExtractor.java new file mode 100644 index 0000000..670298f --- /dev/null +++ b/argus-agent/src/main/java/io/argus/agent/jfr/AllocationEventExtractor.java @@ -0,0 +1,106 @@ +package io.argus.agent.jfr; + +import io.argus.core.event.AllocationEvent; +import jdk.jfr.consumer.RecordedEvent; + +import java.time.Instant; + +/** + * Extracts allocation event data from JFR RecordedEvent objects. + * + *

    This class handles extraction of object allocation information + * from the {@code jdk.ObjectAllocationInNewTLAB} JFR event. + */ +public final class AllocationEventExtractor { + + /** + * Extracts an AllocationEvent from a jdk.ObjectAllocationInNewTLAB JFR event. + * + * @param event the JFR event + * @return the extracted AllocationEvent + */ + public AllocationEvent extractAllocation(RecordedEvent event) { + Instant timestamp = event.getStartTime(); + String className = extractClassName(event); + long allocationSize = extractAllocationSize(event); + long tlabSize = extractTlabSize(event); + + return AllocationEvent.of(timestamp, className, allocationSize, tlabSize); + } + + private String extractClassName(RecordedEvent event) { + // Try objectClass field (contains RecordedClass) + try { + var objectClass = event.getClass("objectClass"); + if (objectClass != null) { + return objectClass.getName(); + } + } catch (Exception ignored) { + } + + // Try class field + try { + return event.getString("class"); + } catch (Exception ignored) { + } + + // Try className field + try { + return event.getString("className"); + } catch (Exception ignored) { + } + + return "Unknown"; + } + + private long extractAllocationSize(RecordedEvent event) { + // Try allocationSize field + try { + return event.getLong("allocationSize"); + } catch (Exception ignored) { + } + + // Try objectSize field + try { + return event.getLong("objectSize"); + } catch (Exception ignored) { + } + + return 0; + } + + private long extractTlabSize(RecordedEvent event) { + // Try tlabSize field + try { + return event.getLong("tlabSize"); + } catch (Exception ignored) { + } + + // Try tlab field + try { + return event.getLong("tlab"); + } catch (Exception ignored) { + } + + return 0; + } + + /** + * Debug method to print all available fields in an allocation JFR event. 
+ * + * @param event the JFR event + */ + public void debugPrintFields(RecordedEvent event) { + System.out.println("[Argus Debug] Allocation Event: " + event.getEventType().getName()); + event.getFields().forEach(field -> { + try { + Object value = event.getValue(field.getName()); + System.out.printf(" %s (%s) = %s%n", + field.getName(), field.getTypeName(), value); + } catch (Exception e) { + System.out.printf(" %s (%s) = ERROR: %s%n", + field.getName(), field.getTypeName(), e.getMessage()); + } + }); + } +} diff --git a/argus-agent/src/main/java/io/argus/agent/jfr/ContentionEventExtractor.java b/argus-agent/src/main/java/io/argus/agent/jfr/ContentionEventExtractor.java new file mode 100644 index 0000000..c5adf38 --- /dev/null +++ b/argus-agent/src/main/java/io/argus/agent/jfr/ContentionEventExtractor.java @@ -0,0 +1,127 @@ +package io.argus.agent.jfr; + +import io.argus.core.event.ContentionEvent; +import jdk.jfr.consumer.RecordedEvent; + +import java.time.Instant; + +/** + * Extracts contention event data from JFR RecordedEvent objects. + * + *

    This class handles extraction of thread contention information + * from the following JFR events: + *

      + *
    • {@code jdk.JavaMonitorEnter} - Monitor enter events
    • + *
    • {@code jdk.JavaMonitorWait} - Monitor wait events
    • + *
    + */ +public final class ContentionEventExtractor { + + /** + * Extracts a ContentionEvent from a jdk.JavaMonitorEnter JFR event. + * + * @param event the JFR event + * @return the extracted ContentionEvent + */ + public ContentionEvent extractMonitorEnter(RecordedEvent event) { + Instant timestamp = event.getStartTime(); + long threadId = extractThreadId(event); + String threadName = extractThreadName(event); + String monitorClass = extractMonitorClass(event); + long durationNanos = event.getDuration().toNanos(); + + return ContentionEvent.enter(timestamp, threadId, threadName, monitorClass, durationNanos); + } + + /** + * Extracts a ContentionEvent from a jdk.JavaMonitorWait JFR event. + * + * @param event the JFR event + * @return the extracted ContentionEvent + */ + public ContentionEvent extractMonitorWait(RecordedEvent event) { + Instant timestamp = event.getStartTime(); + long threadId = extractThreadId(event); + String threadName = extractThreadName(event); + String monitorClass = extractMonitorClass(event); + long durationNanos = event.getDuration().toNanos(); + + return ContentionEvent.wait(timestamp, threadId, threadName, monitorClass, durationNanos); + } + + private long extractThreadId(RecordedEvent event) { + // Try eventThread.javaThreadId + try { + return event.getLong("eventThread.javaThreadId"); + } catch (Exception ignored) { + } + + // Try thread.javaThreadId + try { + return event.getLong("thread.javaThreadId"); + } catch (Exception ignored) { + } + + return 0; + } + + private String extractThreadName(RecordedEvent event) { + // Try eventThread.javaName + try { + return event.getString("eventThread.javaName"); + } catch (Exception ignored) { + } + + // Try thread.name + try { + return event.getString("thread.name"); + } catch (Exception ignored) { + } + + return "Unknown"; + } + + private String extractMonitorClass(RecordedEvent event) { + // Try monitorClass field (contains RecordedClass) + try { + var monitorClass = 
event.getClass("monitorClass"); + if (monitorClass != null) { + return monitorClass.getName(); + } + } catch (Exception ignored) { + } + + // Try monitor.class + try { + return event.getString("monitor.class"); + } catch (Exception ignored) { + } + + // Try class field + try { + return event.getString("class"); + } catch (Exception ignored) { + } + + return "Unknown"; + } + + /** + * Debug method to print all available fields in a contention JFR event. + * + * @param event the JFR event + */ + public void debugPrintFields(RecordedEvent event) { + System.out.println("[Argus Debug] Contention Event: " + event.getEventType().getName()); + event.getFields().forEach(field -> { + try { + Object value = event.getValue(field.getName()); + System.out.printf(" %s (%s) = %s%n", + field.getName(), field.getTypeName(), value); + } catch (Exception e) { + System.out.printf(" %s (%s) = ERROR: %s%n", + field.getName(), field.getTypeName(), e.getMessage()); + } + }); + } +} diff --git a/argus-agent/src/main/java/io/argus/agent/jfr/ExecutionSampleExtractor.java b/argus-agent/src/main/java/io/argus/agent/jfr/ExecutionSampleExtractor.java new file mode 100644 index 0000000..4bd605a --- /dev/null +++ b/argus-agent/src/main/java/io/argus/agent/jfr/ExecutionSampleExtractor.java @@ -0,0 +1,162 @@ +package io.argus.agent.jfr; + +import io.argus.core.event.ExecutionSampleEvent; +import jdk.jfr.consumer.RecordedEvent; +import jdk.jfr.consumer.RecordedFrame; +import jdk.jfr.consumer.RecordedStackTrace; + +import java.time.Instant; + +/** + * Extracts execution sample event data from JFR RecordedEvent objects. + * + *

    This class handles extraction of execution sample information + * from the {@code jdk.ExecutionSample} JFR event for CPU profiling. + */ +public final class ExecutionSampleExtractor { + + /** + * Extracts an ExecutionSampleEvent from a jdk.ExecutionSample JFR event. + * + * @param event the JFR event + * @return the extracted ExecutionSampleEvent, or null if stack trace is empty + */ + public ExecutionSampleEvent extractExecutionSample(RecordedEvent event) { + RecordedStackTrace stackTrace = event.getStackTrace(); + if (stackTrace == null || stackTrace.getFrames().isEmpty()) { + return null; + } + + Instant timestamp = event.getStartTime(); + long threadId = extractThreadId(event); + String threadName = extractThreadName(event); + + // Get the top frame + RecordedFrame topFrame = stackTrace.getFrames().getFirst(); + String methodName = extractMethodName(topFrame); + String className = extractClassName(topFrame); + int lineNumber = topFrame.getLineNumber(); + + String fullStackTrace = formatStackTrace(stackTrace); + + return ExecutionSampleEvent.of(timestamp, threadId, threadName, + methodName, className, lineNumber, fullStackTrace); + } + + private long extractThreadId(RecordedEvent event) { + // Try sampledThread.javaThreadId + try { + return event.getLong("sampledThread.javaThreadId"); + } catch (Exception ignored) { + } + + // Try eventThread.javaThreadId + try { + return event.getLong("eventThread.javaThreadId"); + } catch (Exception ignored) { + } + + // Try thread.javaThreadId + try { + return event.getLong("thread.javaThreadId"); + } catch (Exception ignored) { + } + + return 0; + } + + private String extractThreadName(RecordedEvent event) { + // Try sampledThread.javaName + try { + var thread = event.getValue("sampledThread"); + if (thread != null) { + return event.getString("sampledThread.javaName"); + } + } catch (Exception ignored) { + } + + // Try eventThread.javaName + try { + return event.getString("eventThread.javaName"); + } catch (Exception 
ignored) { + } + + // Try thread.name + try { + return event.getString("thread.name"); + } catch (Exception ignored) { + } + + return "Unknown"; + } + + private String extractMethodName(RecordedFrame frame) { + try { + var method = frame.getMethod(); + if (method != null) { + return method.getName(); + } + } catch (Exception ignored) { + } + return "unknown"; + } + + private String extractClassName(RecordedFrame frame) { + try { + var method = frame.getMethod(); + if (method != null && method.getType() != null) { + return method.getType().getName(); + } + } catch (Exception ignored) { + } + return "Unknown"; + } + + private String formatStackTrace(RecordedStackTrace stackTrace) { + if (stackTrace == null) { + return ""; + } + + StringBuilder sb = new StringBuilder(); + for (RecordedFrame frame : stackTrace.getFrames()) { + try { + var method = frame.getMethod(); + if (method != null) { + String className = method.getType() != null ? method.getType().getName() : "Unknown"; + String methodName = method.getName(); + int lineNumber = frame.getLineNumber(); + + sb.append(" at ").append(className).append(".") + .append(methodName).append("("); + if (lineNumber >= 0) { + sb.append("line:").append(lineNumber); + } else { + sb.append("Unknown Source"); + } + sb.append(")\n"); + } + } catch (Exception ignored) { + } + } + return sb.toString(); + } + + /** + * Debug method to print all available fields in an execution sample JFR event. 
+ * + * @param event the JFR event + */ + public void debugPrintFields(RecordedEvent event) { + System.out.println("[Argus Debug] Execution Sample Event: " + event.getEventType().getName()); + event.getFields().forEach(field -> { + try { + Object value = event.getValue(field.getName()); + System.out.printf(" %s (%s) = %s%n", + field.getName(), field.getTypeName(), value); + } catch (Exception e) { + System.out.printf(" %s (%s) = ERROR: %s%n", + field.getName(), field.getTypeName(), e.getMessage()); + } + }); + } +} diff --git a/argus-agent/src/main/java/io/argus/agent/jfr/JfrStreamingEngine.java b/argus-agent/src/main/java/io/argus/agent/jfr/JfrStreamingEngine.java index 724e414..883e535 100644 --- a/argus-agent/src/main/java/io/argus/agent/jfr/JfrStreamingEngine.java +++ b/argus-agent/src/main/java/io/argus/agent/jfr/JfrStreamingEngine.java @@ -1,8 +1,12 @@ package io.argus.agent.jfr; import io.argus.core.buffer.RingBuffer; +import io.argus.core.event.AllocationEvent; +import io.argus.core.event.ContentionEvent; import io.argus.core.event.CPUEvent; +import io.argus.core.event.ExecutionSampleEvent; import io.argus.core.event.GCEvent; +import io.argus.core.event.MetaspaceEvent; import io.argus.core.event.VirtualThreadEvent; import jdk.jfr.consumer.RecordedEvent; @@ -32,6 +36,11 @@ *

  • {@code jdk.GarbageCollection} - GC pause events
  • *
  • {@code jdk.GCHeapSummary} - Heap usage summary
  • *
  • {@code jdk.CPULoad} - CPU load metrics
  • + *
  • {@code jdk.ObjectAllocationInNewTLAB} - Object allocation events
  • + *
  • {@code jdk.MetaspaceSummary} - Metaspace usage
  • + *
  • {@code jdk.ExecutionSample} - CPU profiling samples
  • + *
  • {@code jdk.JavaMonitorEnter} - Lock acquisition contention (monitor enter)
  • + *
  • {@code jdk.JavaMonitorWait} - Lock wait events
  • * * * @see JfrEventExtractor @@ -51,22 +60,55 @@ public final class JfrStreamingEngine { // CPU events private static final String EVENT_CPU_LOAD = "jdk.CPULoad"; + // Allocation events + private static final String EVENT_ALLOCATION_TLAB = "jdk.ObjectAllocationInNewTLAB"; + private static final String EVENT_ALLOCATION_OUTSIDE_TLAB = "jdk.ObjectAllocationOutsideTLAB"; + private static final String EVENT_METASPACE = "jdk.MetaspaceSummary"; + + // Profiling events + private static final String EVENT_EXECUTION_SAMPLE = "jdk.ExecutionSample"; + + // Contention events + private static final String EVENT_MONITOR_ENTER = "jdk.JavaMonitorEnter"; + private static final String EVENT_MONITOR_WAIT = "jdk.JavaMonitorWait"; + private final RingBuffer eventBuffer; private final RingBuffer gcEventBuffer; private final RingBuffer cpuEventBuffer; + private final RingBuffer allocationEventBuffer; + private final RingBuffer metaspaceEventBuffer; + private final RingBuffer executionSampleEventBuffer; + private final RingBuffer contentionEventBuffer; + private final JfrEventExtractor extractor; private final GCEventExtractor gcExtractor; private final CPUEventExtractor cpuExtractor; + private final AllocationEventExtractor allocationExtractor; + private final MetaspaceEventExtractor metaspaceExtractor; + private final ExecutionSampleExtractor executionSampleExtractor; + private final ContentionEventExtractor contentionExtractor; + private final AtomicBoolean running = new AtomicBoolean(false); private final AtomicLong eventsProcessed = new AtomicLong(0); private final AtomicLong gcEventsProcessed = new AtomicLong(0); private final AtomicLong cpuEventsProcessed = new AtomicLong(0); + private final AtomicLong allocationEventsProcessed = new AtomicLong(0); + private final AtomicLong metaspaceEventsProcessed = new AtomicLong(0); + private final AtomicLong executionSampleEventsProcessed = new AtomicLong(0); + private final AtomicLong contentionEventsProcessed = new AtomicLong(0); private 
final CountDownLatch startedLatch = new CountDownLatch(1); // Configuration private final boolean gcEnabled; private final boolean cpuEnabled; private final int cpuIntervalMs; + private final boolean allocationEnabled; + private final int allocationThreshold; + private final boolean metaspaceEnabled; + private final boolean profilingEnabled; + private final int profilingIntervalMs; + private final boolean contentionEnabled; + private final int contentionThresholdMs; // Track thread start times for duration calculation private final Map threadStartTimes = new ConcurrentHashMap<>(); @@ -80,11 +122,12 @@ public final class JfrStreamingEngine { * @param eventBuffer the ring buffer to write events to */ public JfrStreamingEngine(RingBuffer eventBuffer) { - this(eventBuffer, null, null, false, false, 1000); + this(eventBuffer, null, null, null, null, null, null, + false, false, 1000, false, 1024, false, false, 20, false, 10); } /** - * Creates a new JFR streaming engine with full event capture support. + * Creates a new JFR streaming engine with basic event capture support (backward compatible). * * @param eventBuffer the ring buffer for virtual thread events * @param gcEventBuffer the ring buffer for GC events (can be null if gcEnabled is false) @@ -99,15 +142,74 @@ public JfrStreamingEngine(RingBuffer eventBuffer, boolean gcEnabled, boolean cpuEnabled, int cpuIntervalMs) { + this(eventBuffer, gcEventBuffer, cpuEventBuffer, null, null, null, null, + gcEnabled, cpuEnabled, cpuIntervalMs, false, 1024, false, false, 20, false, 10); + } + + /** + * Creates a new JFR streaming engine with full event capture support. 
+ * + * @param eventBuffer the ring buffer for virtual thread events + * @param gcEventBuffer the ring buffer for GC events + * @param cpuEventBuffer the ring buffer for CPU events + * @param allocationEventBuffer the ring buffer for allocation events + * @param metaspaceEventBuffer the ring buffer for metaspace events + * @param executionSampleBuffer the ring buffer for execution sample events + * @param contentionEventBuffer the ring buffer for contention events + * @param gcEnabled whether to capture GC events + * @param cpuEnabled whether to capture CPU events + * @param cpuIntervalMs CPU sampling interval in milliseconds + * @param allocationEnabled whether to capture allocation events + * @param allocationThreshold minimum allocation size to track + * @param metaspaceEnabled whether to capture metaspace events + * @param profilingEnabled whether to capture execution samples + * @param profilingIntervalMs profiling sampling interval in milliseconds + * @param contentionEnabled whether to capture contention events + * @param contentionThresholdMs minimum contention time to track in ms + */ + public JfrStreamingEngine(RingBuffer eventBuffer, + RingBuffer gcEventBuffer, + RingBuffer cpuEventBuffer, + RingBuffer allocationEventBuffer, + RingBuffer metaspaceEventBuffer, + RingBuffer executionSampleBuffer, + RingBuffer contentionEventBuffer, + boolean gcEnabled, + boolean cpuEnabled, + int cpuIntervalMs, + boolean allocationEnabled, + int allocationThreshold, + boolean metaspaceEnabled, + boolean profilingEnabled, + int profilingIntervalMs, + boolean contentionEnabled, + int contentionThresholdMs) { this.eventBuffer = eventBuffer; this.gcEventBuffer = gcEventBuffer; this.cpuEventBuffer = cpuEventBuffer; + this.allocationEventBuffer = allocationEventBuffer; + this.metaspaceEventBuffer = metaspaceEventBuffer; + this.executionSampleEventBuffer = executionSampleBuffer; + this.contentionEventBuffer = contentionEventBuffer; + this.gcEnabled = gcEnabled; this.cpuEnabled = 
cpuEnabled; this.cpuIntervalMs = cpuIntervalMs; + this.allocationEnabled = allocationEnabled; + this.allocationThreshold = allocationThreshold; + this.metaspaceEnabled = metaspaceEnabled; + this.profilingEnabled = profilingEnabled; + this.profilingIntervalMs = profilingIntervalMs; + this.contentionEnabled = contentionEnabled; + this.contentionThresholdMs = contentionThresholdMs; + this.extractor = new JfrEventExtractor(); this.gcExtractor = gcEnabled ? new GCEventExtractor() : null; this.cpuExtractor = cpuEnabled ? new CPUEventExtractor() : null; + this.allocationExtractor = allocationEnabled ? new AllocationEventExtractor() : null; + this.metaspaceExtractor = metaspaceEnabled ? new MetaspaceEventExtractor() : null; + this.executionSampleExtractor = profilingEnabled ? new ExecutionSampleExtractor() : null; + this.contentionExtractor = contentionEnabled ? new ContentionEventExtractor() : null; } /** @@ -186,6 +288,32 @@ private void configureEvents(RecordingStream rs) { System.out.printf("[Argus] CPU monitoring enabled (interval: %dms)%n", cpuIntervalMs); } + // Enable allocation events if configured + if (allocationEnabled) { + rs.enable(EVENT_ALLOCATION_TLAB).withoutThreshold(); + rs.enable(EVENT_ALLOCATION_OUTSIDE_TLAB).withoutThreshold(); + System.out.printf("[Argus] Allocation tracking enabled (threshold: %d bytes)%n", allocationThreshold); + } + + // Enable metaspace events if configured + if (metaspaceEnabled) { + rs.enable(EVENT_METASPACE).withoutThreshold(); + System.out.println("[Argus] Metaspace monitoring enabled"); + } + + // Enable profiling events if configured + if (profilingEnabled) { + rs.enable(EVENT_EXECUTION_SAMPLE).withPeriod(Duration.ofMillis(profilingIntervalMs)); + System.out.printf("[Argus] Method profiling enabled (interval: %dms)%n", profilingIntervalMs); + } + + // Enable contention events if configured + if (contentionEnabled) { + rs.enable(EVENT_MONITOR_ENTER).withThreshold(Duration.ofMillis(contentionThresholdMs)); + 
rs.enable(EVENT_MONITOR_WAIT).withThreshold(Duration.ofMillis(contentionThresholdMs)); + System.out.printf("[Argus] Contention tracking enabled (threshold: %dms)%n", contentionThresholdMs); + } + // Set buffer settings rs.setMaxAge(Duration.ofSeconds(10)); rs.setMaxSize(10 * 1024 * 1024); // 10 MB @@ -208,6 +336,28 @@ private void registerEventHandlers(RecordingStream rs) { if (cpuEnabled) { rs.onEvent(EVENT_CPU_LOAD, this::handleCPULoad); } + + // Allocation event handlers + if (allocationEnabled) { + rs.onEvent(EVENT_ALLOCATION_TLAB, this::handleAllocation); + rs.onEvent(EVENT_ALLOCATION_OUTSIDE_TLAB, this::handleAllocation); + } + + // Metaspace event handlers + if (metaspaceEnabled) { + rs.onEvent(EVENT_METASPACE, this::handleMetaspace); + } + + // Profiling event handlers + if (profilingEnabled) { + rs.onEvent(EVENT_EXECUTION_SAMPLE, this::handleExecutionSample); + } + + // Contention event handlers + if (contentionEnabled) { + rs.onEvent(EVENT_MONITOR_ENTER, this::handleMonitorEnter); + rs.onEvent(EVENT_MONITOR_WAIT, this::handleMonitorWait); + } } private void handleStart(RecordedEvent event) { @@ -295,6 +445,52 @@ private void handleCPULoad(RecordedEvent event) { cpuEventsProcessed.incrementAndGet(); } + private void handleAllocation(RecordedEvent event) { + if (allocationEventBuffer == null || allocationExtractor == null) return; + + AllocationEvent allocationEvent = allocationExtractor.extractAllocation(event); + + // Apply threshold filter + if (allocationEvent.allocationSize() >= allocationThreshold) { + allocationEventBuffer.offer(allocationEvent); + allocationEventsProcessed.incrementAndGet(); + } + } + + private void handleMetaspace(RecordedEvent event) { + if (metaspaceEventBuffer == null || metaspaceExtractor == null) return; + + MetaspaceEvent metaspaceEvent = metaspaceExtractor.extractMetaspace(event); + metaspaceEventBuffer.offer(metaspaceEvent); + metaspaceEventsProcessed.incrementAndGet(); + } + + private void 
handleExecutionSample(RecordedEvent event) { + if (executionSampleEventBuffer == null || executionSampleExtractor == null) return; + + ExecutionSampleEvent sampleEvent = executionSampleExtractor.extractExecutionSample(event); + if (sampleEvent != null) { + executionSampleEventBuffer.offer(sampleEvent); + executionSampleEventsProcessed.incrementAndGet(); + } + } + + private void handleMonitorEnter(RecordedEvent event) { + if (contentionEventBuffer == null || contentionExtractor == null) return; + + ContentionEvent contentionEvent = contentionExtractor.extractMonitorEnter(event); + contentionEventBuffer.offer(contentionEvent); + contentionEventsProcessed.incrementAndGet(); + } + + private void handleMonitorWait(RecordedEvent event) { + if (contentionEventBuffer == null || contentionExtractor == null) return; + + ContentionEvent contentionEvent = contentionExtractor.extractMonitorWait(event); + contentionEventBuffer.offer(contentionEvent); + contentionEventsProcessed.incrementAndGet(); + } + /** * Stops the JFR streaming engine. */ @@ -373,4 +569,76 @@ public boolean isGcEnabled() { public boolean isCpuEnabled() { return cpuEnabled; } + + /** + * Returns the total number of allocation events processed. + * + * @return allocation event count + */ + public long getAllocationEventsProcessed() { + return allocationEventsProcessed.get(); + } + + /** + * Returns the total number of metaspace events processed. + * + * @return metaspace event count + */ + public long getMetaspaceEventsProcessed() { + return metaspaceEventsProcessed.get(); + } + + /** + * Returns the total number of execution sample events processed. + * + * @return execution sample event count + */ + public long getExecutionSampleEventsProcessed() { + return executionSampleEventsProcessed.get(); + } + + /** + * Returns the total number of contention events processed. 
+ * + * @return contention event count + */ + public long getContentionEventsProcessed() { + return contentionEventsProcessed.get(); + } + + /** + * Returns whether allocation tracking is enabled. + * + * @return true if allocation tracking is enabled + */ + public boolean isAllocationEnabled() { + return allocationEnabled; + } + + /** + * Returns whether metaspace monitoring is enabled. + * + * @return true if metaspace monitoring is enabled + */ + public boolean isMetaspaceEnabled() { + return metaspaceEnabled; + } + + /** + * Returns whether method profiling is enabled. + * + * @return true if method profiling is enabled + */ + public boolean isProfilingEnabled() { + return profilingEnabled; + } + + /** + * Returns whether contention tracking is enabled. + * + * @return true if contention tracking is enabled + */ + public boolean isContentionEnabled() { + return contentionEnabled; + } } diff --git a/argus-agent/src/main/java/io/argus/agent/jfr/MetaspaceEventExtractor.java b/argus-agent/src/main/java/io/argus/agent/jfr/MetaspaceEventExtractor.java new file mode 100644 index 0000000..393b6ac --- /dev/null +++ b/argus-agent/src/main/java/io/argus/agent/jfr/MetaspaceEventExtractor.java @@ -0,0 +1,138 @@ +package io.argus.agent.jfr; + +import io.argus.core.event.MetaspaceEvent; +import jdk.jfr.consumer.RecordedEvent; + +import java.time.Instant; + +/** + * Extracts metaspace event data from JFR RecordedEvent objects. + * + *

    This class handles extraction of metaspace information + * from the {@code jdk.MetaspaceSummary} JFR event. + */ +public final class MetaspaceEventExtractor { + + /** + * Extracts a MetaspaceEvent from a jdk.MetaspaceSummary JFR event. + * + * @param event the JFR event + * @return the extracted MetaspaceEvent + */ + public MetaspaceEvent extractMetaspace(RecordedEvent event) { + Instant timestamp = event.getStartTime(); + long used = extractMetaspaceUsed(event); + long committed = extractMetaspaceCommitted(event); + long reserved = extractMetaspaceReserved(event); + long classCount = extractClassCount(event); + + return MetaspaceEvent.of(timestamp, used, committed, reserved, classCount); + } + + private long extractMetaspaceUsed(RecordedEvent event) { + // Try metaspace.used field path + try { + return event.getLong("metaspace.used"); + } catch (Exception ignored) { + } + + // Try dataSpace.used + classSpace.used + try { + long dataUsed = event.getLong("dataSpace.used"); + long classUsed = event.getLong("classSpace.used"); + return dataUsed + classUsed; + } catch (Exception ignored) { + } + + // Try used field + try { + return event.getLong("used"); + } catch (Exception ignored) { + } + + return 0; + } + + private long extractMetaspaceCommitted(RecordedEvent event) { + // Try metaspace.committed field path + try { + return event.getLong("metaspace.committed"); + } catch (Exception ignored) { + } + + // Try dataSpace.committed + classSpace.committed + try { + long dataCommitted = event.getLong("dataSpace.committed"); + long classCommitted = event.getLong("classSpace.committed"); + return dataCommitted + classCommitted; + } catch (Exception ignored) { + } + + // Try committed field + try { + return event.getLong("committed"); + } catch (Exception ignored) { + } + + return 0; + } + + private long extractMetaspaceReserved(RecordedEvent event) { + // Try metaspace.reserved field path + try { + return event.getLong("metaspace.reserved"); + } catch (Exception 
ignored) { + } + + // Try dataSpace.reserved + classSpace.reserved + try { + long dataReserved = event.getLong("dataSpace.reserved"); + long classReserved = event.getLong("classSpace.reserved"); + return dataReserved + classReserved; + } catch (Exception ignored) { + } + + // Try reserved field + try { + return event.getLong("reserved"); + } catch (Exception ignored) { + } + + return 0; + } + + private long extractClassCount(RecordedEvent event) { + // Try classCount field + try { + return event.getLong("classCount"); + } catch (Exception ignored) { + } + + // Try classLoader.classCount + try { + return event.getLong("classLoader.classCount"); + } catch (Exception ignored) { + } + + return 0; + } + + /** + * Debug method to print all available fields in a metaspace JFR event. + * + * @param event the JFR event + */ + public void debugPrintFields(RecordedEvent event) { + System.out.println("[Argus Debug] Metaspace Event: " + event.getEventType().getName()); + event.getFields().forEach(field -> { + try { + Object value = event.getValue(field.getName()); + System.out.printf(" %s (%s) = %s%n", + field.getName(), field.getTypeName(), value); + } catch (Exception e) { + System.out.printf(" %s (%s) = ERROR: %s%n", + field.getName(), field.getTypeName(), e.getMessage()); + } + }); + } +} diff --git a/argus-core/src/main/java/io/argus/core/event/AllocationEvent.java b/argus-core/src/main/java/io/argus/core/event/AllocationEvent.java new file mode 100644 index 0000000..d7ef8f1 --- /dev/null +++ b/argus-core/src/main/java/io/argus/core/event/AllocationEvent.java @@ -0,0 +1,53 @@ +package io.argus.core.event; + +import java.time.Instant; + +/** + * Represents an object allocation event captured by the Argus agent. + * + *

    This event is generated from {@code jdk.ObjectAllocationInNewTLAB} JFR events + * which track object allocations in new Thread Local Allocation Buffers. + * + * @param timestamp the event timestamp + * @param className the class of the allocated object + * @param allocationSize the size of the allocation in bytes + * @param tlabSize the size of the TLAB in bytes + */ +public record AllocationEvent( + Instant timestamp, + String className, + long allocationSize, + long tlabSize +) { + /** + * Creates an allocation event. + * + * @param timestamp the event timestamp + * @param className the class of the allocated object + * @param allocationSize the size of the allocation in bytes + * @param tlabSize the size of the TLAB in bytes + * @return the allocation event + */ + public static AllocationEvent of(Instant timestamp, String className, + long allocationSize, long tlabSize) { + return new AllocationEvent(timestamp, className, allocationSize, tlabSize); + } + + /** + * Returns the allocation size in KB. + * + * @return allocation size in KB + */ + public double allocationSizeKB() { + return allocationSize / 1024.0; + } + + /** + * Returns the TLAB size in KB. + * + * @return TLAB size in KB + */ + public double tlabSizeKB() { + return tlabSize / 1024.0; + } +} diff --git a/argus-core/src/main/java/io/argus/core/event/ContentionEvent.java b/argus-core/src/main/java/io/argus/core/event/ContentionEvent.java new file mode 100644 index 0000000..6273982 --- /dev/null +++ b/argus-core/src/main/java/io/argus/core/event/ContentionEvent.java @@ -0,0 +1,93 @@ +package io.argus.core.event; + +import java.time.Instant; + +/** + * Represents a thread contention event captured by the Argus agent. + * + *

    This event is generated from JFR events: + *

      + *
    • {@code jdk.JavaMonitorEnter} - Thread attempting to enter a synchronized block
    • + *
    • {@code jdk.JavaMonitorWait} - Thread waiting on a monitor
    • + *
    + * + * @param timestamp the event timestamp + * @param threadId the thread ID experiencing contention + * @param threadName the thread name + * @param monitorClass the class of the monitor object + * @param durationNanos the duration of the contention in nanoseconds + * @param type the type of contention (ENTER or WAIT) + */ +public record ContentionEvent( + Instant timestamp, + long threadId, + String threadName, + String monitorClass, + long durationNanos, + ContentionType type +) { + /** + * Types of contention events. + */ + public enum ContentionType { + /** + * Thread attempting to enter a synchronized block. + */ + ENTER, + + /** + * Thread waiting on a monitor (Object.wait()). + */ + WAIT + } + + /** + * Creates a monitor enter contention event. + * + * @param timestamp the event timestamp + * @param threadId the thread ID + * @param threadName the thread name + * @param monitorClass the monitor class + * @param durationNanos the duration in nanoseconds + * @return the contention event + */ + public static ContentionEvent enter(Instant timestamp, long threadId, String threadName, + String monitorClass, long durationNanos) { + return new ContentionEvent(timestamp, threadId, threadName, monitorClass, + durationNanos, ContentionType.ENTER); + } + + /** + * Creates a monitor wait contention event. + * + * @param timestamp the event timestamp + * @param threadId the thread ID + * @param threadName the thread name + * @param monitorClass the monitor class + * @param durationNanos the duration in nanoseconds + * @return the contention event + */ + public static ContentionEvent wait(Instant timestamp, long threadId, String threadName, + String monitorClass, long durationNanos) { + return new ContentionEvent(timestamp, threadId, threadName, monitorClass, + durationNanos, ContentionType.WAIT); + } + + /** + * Returns the duration in milliseconds. 
+ * + * @return duration in milliseconds + */ + public double durationMs() { + return durationNanos / 1_000_000.0; + } + + /** + * Returns the duration in microseconds. + * + * @return duration in microseconds + */ + public double durationMicros() { + return durationNanos / 1_000.0; + } +} diff --git a/argus-core/src/main/java/io/argus/core/event/EventType.java b/argus-core/src/main/java/io/argus/core/event/EventType.java index f9c1f0e..03624f7 100644 --- a/argus-core/src/main/java/io/argus/core/event/EventType.java +++ b/argus-core/src/main/java/io/argus/core/event/EventType.java @@ -40,7 +40,29 @@ public enum EventType { /** * CPU load metrics. */ - CPU_LOAD(20); + CPU_LOAD(20), + + // Allocation Events (30-39) + /** + * Object allocation in new TLAB. + */ + ALLOCATION(30), + + /** + * Metaspace summary. + */ + METASPACE_SUMMARY(31), + + // Profiling Events (40-49) + /** + * Execution sample for method profiling. + */ + EXECUTION_SAMPLE(40), + + /** + * Thread contention event (lock wait/enter). + */ + CONTENTION(41); private final int code; @@ -61,6 +83,10 @@ public static EventType fromCode(int code) { case 10 -> GC_PAUSE; case 11 -> GC_HEAP_SUMMARY; case 20 -> CPU_LOAD; + case 30 -> ALLOCATION; + case 31 -> METASPACE_SUMMARY; + case 40 -> EXECUTION_SAMPLE; + case 41 -> CONTENTION; default -> throw new IllegalArgumentException("Unknown event type code: " + code); }; } diff --git a/argus-core/src/main/java/io/argus/core/event/ExecutionSampleEvent.java b/argus-core/src/main/java/io/argus/core/event/ExecutionSampleEvent.java new file mode 100644 index 0000000..bd61960 --- /dev/null +++ b/argus-core/src/main/java/io/argus/core/event/ExecutionSampleEvent.java @@ -0,0 +1,65 @@ +package io.argus.core.event; + +import java.time.Instant; + +/** + * Represents an execution sample event captured by the Argus agent. + * + *

    This event is generated from {@code jdk.ExecutionSample} JFR events + * which are periodic CPU samples for profiling hot methods. + * + * @param timestamp the event timestamp + * @param threadId the thread ID being sampled + * @param threadName the thread name + * @param methodName the method name at the top of the stack + * @param className the class name containing the method + * @param lineNumber the line number in the source file + * @param stackTrace the full stack trace + */ +public record ExecutionSampleEvent( + Instant timestamp, + long threadId, + String threadName, + String methodName, + String className, + int lineNumber, + String stackTrace +) { + /** + * Creates an execution sample event. + * + * @param timestamp the event timestamp + * @param threadId the thread ID + * @param threadName the thread name + * @param methodName the method name + * @param className the class name + * @param lineNumber the line number + * @param stackTrace the full stack trace + * @return the execution sample event + */ + public static ExecutionSampleEvent of(Instant timestamp, long threadId, String threadName, + String methodName, String className, int lineNumber, + String stackTrace) { + return new ExecutionSampleEvent(timestamp, threadId, threadName, methodName, + className, lineNumber, stackTrace); + } + + /** + * Returns the fully qualified method name (class.method). + * + * @return fully qualified method name + */ + public String fullyQualifiedMethod() { + return className + "." + methodName; + } + + /** + * Returns the package name from the class name. + * + * @return package name or empty string if no package + */ + public String packageName() { + int lastDot = className.lastIndexOf('.'); + return lastDot > 0 ? 
className.substring(0, lastDot) : ""; + } +} diff --git a/argus-core/src/main/java/io/argus/core/event/MetaspaceEvent.java b/argus-core/src/main/java/io/argus/core/event/MetaspaceEvent.java new file mode 100644 index 0000000..75ac8a9 --- /dev/null +++ b/argus-core/src/main/java/io/argus/core/event/MetaspaceEvent.java @@ -0,0 +1,67 @@ +package io.argus.core.event; + +import java.time.Instant; + +/** + * Represents a metaspace summary event captured by the Argus agent. + * + *

    This event is generated from {@code jdk.MetaspaceSummary} JFR events + * which track metaspace memory usage. + * + * @param timestamp the event timestamp + * @param metaspaceUsed metaspace used memory in bytes + * @param metaspaceCommitted metaspace committed memory in bytes + * @param metaspaceReserved metaspace reserved memory in bytes + * @param classCount number of loaded classes + */ +public record MetaspaceEvent( + Instant timestamp, + long metaspaceUsed, + long metaspaceCommitted, + long metaspaceReserved, + long classCount +) { + /** + * Creates a metaspace event. + * + * @param timestamp the event timestamp + * @param metaspaceUsed metaspace used memory in bytes + * @param metaspaceCommitted metaspace committed memory in bytes + * @param metaspaceReserved metaspace reserved memory in bytes + * @param classCount number of loaded classes + * @return the metaspace event + */ + public static MetaspaceEvent of(Instant timestamp, long metaspaceUsed, + long metaspaceCommitted, long metaspaceReserved, + long classCount) { + return new MetaspaceEvent(timestamp, metaspaceUsed, metaspaceCommitted, + metaspaceReserved, classCount); + } + + /** + * Returns the metaspace used in MB. + * + * @return metaspace used in MB + */ + public double usedMB() { + return metaspaceUsed / (1024.0 * 1024.0); + } + + /** + * Returns the metaspace committed in MB. + * + * @return metaspace committed in MB + */ + public double committedMB() { + return metaspaceCommitted / (1024.0 * 1024.0); + } + + /** + * Returns the metaspace utilization ratio (used/committed). + * + * @return utilization ratio (0.0-1.0) + */ + public double utilizationRatio() { + return metaspaceCommitted > 0 ? 
(double) metaspaceUsed / metaspaceCommitted : 0.0; + } +} diff --git a/argus-frontend/src/main/resources/public/css/style.css b/argus-frontend/src/main/resources/public/css/style.css index a1f8516..0518ebb 100644 --- a/argus-frontend/src/main/resources/public/css/style.css +++ b/argus-frontend/src/main/resources/public/css/style.css @@ -134,11 +134,18 @@ main { /* Metrics */ .metrics { display: grid; - grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); - gap: 1rem; + grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)); + gap: 0.75rem; margin-bottom: 1.5rem; } +/* Ensure all 9 cards fit on one row for wider screens */ +@media (min-width: 1200px) { + .metrics { + grid-template-columns: repeat(9, 1fr); + } +} + .metric-card { background-color: var(--bg-secondary); border: 1px solid var(--border-color); diff --git a/argus-frontend/src/main/resources/public/index.html b/argus-frontend/src/main/resources/public/index.html index de22f68..970b8dd 100644 --- a/argus-frontend/src/main/resources/public/index.html +++ b/argus-frontend/src/main/resources/public/index.html @@ -59,6 +59,10 @@

    Heap Used

    CPU (JVM)

    -%
    +
    +

    GC Overhead

    +
    -%
    +
    @@ -150,6 +154,82 @@

    CPU Load

    + +
    +
    +

    Allocation & Metaspace

    +
    + Alloc Rate: - MB/s + Total Allocated: - MB + Metaspace: - MB + Classes: 0 +
    +
    +
    +
    +
    +

    Allocation Rate

    + memory allocation over time +
    +
    + +
    +
    +
    +
    +

    Metaspace Usage

    + metaspace memory over time +
    +
    + +
    +
    +
    +
    + + +
    +
    +

    Profiling & Contention

    +
    + CPU Samples: 0 + Contention Events: 0 + Contention Time: 0ms +
    +
    +
    +
    +
    +

    Hot Methods

    + top CPU consuming methods +
    +
    + +
    +
    +
    +
    +

    Lock Contention

    + top contention hotspots +
    +
    + +
    +
    +
    +
    + + +
    +
    +

    Recommendations

    + +
    +
    +
    No recommendations at this time
    +
    +
    +
    diff --git a/argus-frontend/src/main/resources/public/js/app.js b/argus-frontend/src/main/resources/public/js/app.js index 9e5d512..a64d76e 100644 --- a/argus-frontend/src/main/resources/public/js/app.js +++ b/argus-frontend/src/main/resources/public/js/app.js @@ -5,7 +5,7 @@ * the dashboard functionality. */ import { initWebSocket } from './websocket.js'; -import { initCharts, updateCharts, trackEventForCharts, updateGCCharts, updateCPUCharts } from './charts.js'; +import { initCharts, updateCharts, trackEventForCharts, updateGCCharts, updateCPUCharts, updateAllocationCharts, updateMetaspaceCharts, updateProfilingCharts, updateContentionCharts } from './charts.js'; import { initThreadView, renderThreadStateView, captureAllThreadsDump } from './threads.js'; import { counts, @@ -16,7 +16,12 @@ import { addToDurationBucket, threadStates, gcData, - cpuData + cpuData, + allocationData, + metaspaceData, + profilingData, + contentionData, + correlationData } from './state.js'; import { formatNumber, formatTimestamp, escapeHtml, formatDuration } from './utils.js'; import { initFilters, addEvent as addEventToFilter, clearEvents as clearFilterEvents } from './filter.js'; @@ -122,7 +127,29 @@ const elements = { jvmCpu: document.getElementById('jvm-cpu'), cpuJvmCurrent: document.getElementById('cpu-jvm-current'), cpuSystemCurrent: document.getElementById('cpu-system-current'), - cpuPeakJvm: document.getElementById('cpu-peak-jvm') + cpuPeakJvm: document.getElementById('cpu-peak-jvm'), + + // GC Overhead + gcOverhead: document.getElementById('gc-overhead'), + + // Allocation metrics + allocRate: document.getElementById('alloc-rate'), + allocTotal: document.getElementById('alloc-total'), + metaspaceUsed: document.getElementById('metaspace-used'), + classCount: document.getElementById('class-count'), + allocationRateCanvas: document.getElementById('allocation-rate-chart'), + metaspaceCanvas: document.getElementById('metaspace-chart'), + + // Profiling metrics + cpuSamples: 
document.getElementById('cpu-samples'), + contentionEvents: document.getElementById('contention-events'), + contentionTime: document.getElementById('contention-time'), + hotMethodsCanvas: document.getElementById('hot-methods-chart'), + contentionCanvas: document.getElementById('contention-chart'), + + // Recommendations + recommendationsList: document.getElementById('recommendations-list'), + refreshRecommendationsBtn: document.getElementById('refresh-recommendations') }; const maxEvents = 500; @@ -139,7 +166,11 @@ function init() { duration: elements.durationCanvas, gcTimeline: elements.gcTimelineCanvas, heap: elements.heapCanvas, - cpu: elements.cpuCanvas + cpu: elements.cpuCanvas, + allocationRate: elements.allocationRateCanvas, + metaspace: elements.metaspaceCanvas, + hotMethods: elements.hotMethodsCanvas, + contention: elements.contentionCanvas }); // Initialize filters @@ -165,6 +196,11 @@ function init() { fetchPinningAnalysis(); fetchGCAnalysis(); fetchCPUMetrics(); + fetchAllocationAnalysis(); + fetchMetaspaceMetrics(); + fetchMethodProfiling(); + fetchContentionAnalysis(); + fetchCorrelation(); // Setup periodic updates setInterval(updateCharts, 1000); @@ -172,6 +208,11 @@ function init() { setInterval(fetchPinningAnalysis, 5000); setInterval(fetchGCAnalysis, 2000); setInterval(fetchCPUMetrics, 1000); + setInterval(fetchAllocationAnalysis, 2000); + setInterval(fetchMetaspaceMetrics, 5000); + setInterval(fetchMethodProfiling, 5000); + setInterval(fetchContentionAnalysis, 5000); + setInterval(fetchCorrelation, 10000); setInterval(() => { renderThreadStateView(elements.threadsContainer, elements.threadCount); }, 1000); @@ -389,6 +430,19 @@ function updateGCDisplay(data) { if (elements.gcMaxPause) { elements.gcMaxPause.textContent = (data.maxPauseTimeMs || 0) + 'ms'; } + if (elements.gcOverhead) { + const overhead = parseFloat(data.gcOverheadPercent) || 0; + elements.gcOverhead.textContent = overhead.toFixed(1) + '%'; + // Add warning class if overhead > 10% 
+ const card = elements.gcOverhead.closest('.metric-card'); + if (card) { + if (data.isOverheadWarning) { + card.classList.add('warning'); + } else { + card.classList.remove('warning'); + } + } + } } function updateCPUDisplay(data) { @@ -608,5 +662,137 @@ function handleExport() { elements.exportModal.classList.add('hidden'); } +async function fetchAllocationAnalysis() { + try { + const response = await fetch('/allocation-analysis'); + if (response.ok) { + const data = await response.json(); + if (!data.error) { + updateAllocationDisplay(data); + updateAllocationCharts(data); + } + } + } catch (e) { + // Allocation tracking might not be enabled + } +} + +async function fetchMetaspaceMetrics() { + try { + const response = await fetch('/metaspace-metrics'); + if (response.ok) { + const data = await response.json(); + if (!data.error) { + updateMetaspaceDisplay(data); + updateMetaspaceCharts(data); + } + } + } catch (e) { + // Metaspace monitoring might not be enabled + } +} + +async function fetchMethodProfiling() { + try { + const response = await fetch('/method-profiling'); + if (response.ok) { + const data = await response.json(); + if (!data.error) { + updateProfilingDisplay(data); + updateProfilingCharts(data); + } + } + } catch (e) { + // Method profiling might not be enabled + } +} + +async function fetchContentionAnalysis() { + try { + const response = await fetch('/contention-analysis'); + if (response.ok) { + const data = await response.json(); + if (!data.error) { + updateContentionDisplay(data); + updateContentionCharts(data); + } + } + } catch (e) { + // Contention tracking might not be enabled + } +} + +async function fetchCorrelation() { + try { + const response = await fetch('/correlation'); + if (response.ok) { + const data = await response.json(); + if (!data.error) { + updateRecommendations(data); + } + } + } catch (e) { + // Correlation analysis might not be enabled + } +} + +function updateAllocationDisplay(data) { + if (elements.allocRate) { + 
elements.allocRate.textContent = (parseFloat(data.allocationRateMBPerSec) || 0).toFixed(1) + ' MB/s'; + } + if (elements.allocTotal) { + elements.allocTotal.textContent = (parseFloat(data.totalAllocatedMB) || 0).toFixed(1) + ' MB'; + } +} + +function updateMetaspaceDisplay(data) { + if (elements.metaspaceUsed) { + elements.metaspaceUsed.textContent = (parseFloat(data.currentUsedMB) || 0).toFixed(1) + ' MB'; + } + if (elements.classCount) { + elements.classCount.textContent = formatNumber(data.currentClassCount || 0); + } +} + +function updateProfilingDisplay(data) { + if (elements.cpuSamples) { + elements.cpuSamples.textContent = formatNumber(data.totalSamples || 0); + } +} + +function updateContentionDisplay(data) { + if (elements.contentionEvents) { + elements.contentionEvents.textContent = formatNumber(data.totalContentionEvents || 0); + } + if (elements.contentionTime) { + elements.contentionTime.textContent = (data.totalContentionTimeMs || 0) + 'ms'; + } +} + +function updateRecommendations(data) { + if (!elements.recommendationsList) return; + + const recommendations = data.recommendations || []; + + if (recommendations.length === 0) { + elements.recommendationsList.innerHTML = '
    No recommendations at this time
    '; + return; + } + + elements.recommendationsList.innerHTML = recommendations.map(rec => { + const severityClass = rec.severity.toLowerCase(); + return ` +
    +
    + ${rec.type.replace(/_/g, ' ')} + ${rec.severity} +
    +
    ${escapeHtml(rec.title)}
    +
    ${escapeHtml(rec.description)}
    +
    + `; + }).join(''); +} + // Start the application init(); diff --git a/argus-frontend/src/main/resources/public/js/charts.js b/argus-frontend/src/main/resources/public/js/charts.js index feab345..ead2b01 100644 --- a/argus-frontend/src/main/resources/public/js/charts.js +++ b/argus-frontend/src/main/resources/public/js/charts.js @@ -12,7 +12,11 @@ import { updateLastSecondTimestamp, stateCounts, gcData, - cpuData + cpuData, + allocationData, + metaspaceData, + profilingData, + contentionData } from './state.js'; let eventsRateChart = null; @@ -21,6 +25,10 @@ let durationChart = null; let gcTimelineChart = null; let heapChart = null; let cpuChart = null; +let allocationRateChart = null; +let metaspaceChart = null; +let hotMethodsChart = null; +let contentionChart = null; const gridColor = 'rgba(48, 54, 61, 0.8)'; const textColor = '#8b949e'; @@ -297,6 +305,171 @@ export function initCharts(canvases) { } }); } + + // Allocation Rate Chart (Line) + if (canvases.allocationRate) { + allocationRateChart = new Chart(canvases.allocationRate, { + type: 'line', + data: { + labels: allocationData.history.labels, + datasets: [{ + label: 'Allocation Rate (MB/s)', + data: allocationData.history.rates, + borderColor: '#f0883e', + backgroundColor: 'rgba(240, 136, 62, 0.2)', + fill: true, + tension: 0.3, + pointRadius: 0 + }] + }, + options: { + responsive: true, + maintainAspectRatio: false, + animation: { duration: 0 }, + plugins: { legend: { display: false } }, + scales: { + x: { display: false }, + y: { + beginAtZero: true, + grid: { color: gridColor }, + ticks: { + color: textColor, + font: { size: 10 }, + callback: function(value) { + return value.toFixed(1) + ' MB/s'; + } + } + } + } + } + }); + } + + // Metaspace Chart (Line) + if (canvases.metaspace) { + metaspaceChart = new Chart(canvases.metaspace, { + type: 'line', + data: { + labels: metaspaceData.history.labels, + datasets: [ + { + label: 'Used', + data: metaspaceData.history.used, + borderColor: '#a371f7', + 
backgroundColor: 'rgba(163, 113, 247, 0.2)', + fill: true, + tension: 0.3, + pointRadius: 0 + }, + { + label: 'Committed', + data: metaspaceData.history.committed, + borderColor: '#8b949e', + backgroundColor: 'rgba(139, 148, 158, 0.1)', + fill: true, + tension: 0.3, + pointRadius: 0, + borderDash: [5, 5] + } + ] + }, + options: { + responsive: true, + maintainAspectRatio: false, + animation: { duration: 0 }, + plugins: { + legend: { + position: 'top', + labels: { color: textColor, boxWidth: 12, padding: 8, font: { size: 10 } } + } + }, + scales: { + x: { display: false }, + y: { + beginAtZero: true, + grid: { color: gridColor }, + ticks: { + color: textColor, + font: { size: 10 }, + callback: function(value) { + return value.toFixed(0) + ' MB'; + } + } + } + } + } + }); + } + + // Hot Methods Chart (Bar) + if (canvases.hotMethods) { + hotMethodsChart = new Chart(canvases.hotMethods, { + type: 'bar', + data: { + labels: [], + datasets: [{ + label: 'CPU Samples', + data: [], + backgroundColor: 'rgba(88, 166, 255, 0.7)', + borderColor: '#58a6ff', + borderWidth: 1 + }] + }, + options: { + indexAxis: 'y', + responsive: true, + maintainAspectRatio: false, + animation: { duration: 0 }, + plugins: { legend: { display: false } }, + scales: { + x: { + beginAtZero: true, + grid: { color: gridColor }, + ticks: { color: textColor, font: { size: 10 } } + }, + y: { + grid: { display: false }, + ticks: { color: textColor, font: { size: 9 } } + } + } + } + }); + } + + // Contention Chart (Bar) + if (canvases.contention) { + contentionChart = new Chart(canvases.contention, { + type: 'bar', + data: { + labels: [], + datasets: [{ + label: 'Contention Time (ms)', + data: [], + backgroundColor: 'rgba(248, 81, 73, 0.7)', + borderColor: '#f85149', + borderWidth: 1 + }] + }, + options: { + indexAxis: 'y', + responsive: true, + maintainAspectRatio: false, + animation: { duration: 0 }, + plugins: { legend: { display: false } }, + scales: { + x: { + beginAtZero: true, + grid: { color: 
gridColor }, + ticks: { color: textColor, font: { size: 10 } } + }, + y: { + grid: { display: false }, + ticks: { color: textColor, font: { size: 9 } } + } + } + } + }); + } } /** @@ -356,6 +529,10 @@ export function updateCharts() { if (gcTimelineChart) gcTimelineChart.update('none'); if (heapChart) heapChart.update('none'); if (cpuChart) cpuChart.update('none'); + if (allocationRateChart) allocationRateChart.update('none'); + if (metaspaceChart) metaspaceChart.update('none'); + if (hotMethodsChart) hotMethodsChart.update('none'); + if (contentionChart) contentionChart.update('none'); } /** @@ -445,3 +622,108 @@ export function trackEventForCharts(event) { currentSecondEvents.pinned++; } } + +/** + * Update allocation chart data from server response + */ +export function updateAllocationCharts(data) { + // Update allocation state + allocationData.totalAllocations = data.totalAllocations || 0; + allocationData.totalAllocatedMB = parseFloat(data.totalAllocatedMB) || 0; + allocationData.allocationRateMBPerSec = parseFloat(data.allocationRateMBPerSec) || 0; + allocationData.peakAllocationRateMBPerSec = parseFloat(data.peakAllocationRateMBPerSec) || 0; + allocationData.topAllocatingClasses = data.topAllocatingClasses || []; + + // Update history from server + if (data.history && data.history.length > 0) { + allocationData.history.labels.length = 0; + allocationData.history.rates.length = 0; + + data.history.forEach(snapshot => { + const time = new Date(snapshot.timestamp).toLocaleTimeString('en-US', { + hour12: false, + hour: '2-digit', + minute: '2-digit', + second: '2-digit' + }); + allocationData.history.labels.push(time); + allocationData.history.rates.push(parseFloat(snapshot.allocationRateMBPerSec) || 0); + }); + } + + if (allocationRateChart) allocationRateChart.update('none'); +} + +/** + * Update metaspace chart data from server response + */ +export function updateMetaspaceCharts(data) { + // Update metaspace state + metaspaceData.currentUsedMB = 
parseFloat(data.currentUsedMB) || 0; + metaspaceData.currentCommittedMB = parseFloat(data.currentCommittedMB) || 0; + metaspaceData.peakUsedMB = parseFloat(data.peakUsedMB) || 0; + metaspaceData.growthRateMBPerMin = parseFloat(data.growthRateMBPerMin) || 0; + metaspaceData.classCount = data.currentClassCount || 0; + + // Update history from server + if (data.history && data.history.length > 0) { + metaspaceData.history.labels.length = 0; + metaspaceData.history.used.length = 0; + metaspaceData.history.committed.length = 0; + + data.history.forEach(snapshot => { + const time = new Date(snapshot.timestamp).toLocaleTimeString('en-US', { + hour12: false, + hour: '2-digit', + minute: '2-digit', + second: '2-digit' + }); + metaspaceData.history.labels.push(time); + metaspaceData.history.used.push(parseFloat(snapshot.usedMB) || 0); + metaspaceData.history.committed.push(parseFloat(snapshot.committedMB) || 0); + }); + } + + if (metaspaceChart) metaspaceChart.update('none'); +} + +/** + * Update method profiling chart data from server response + */ +export function updateProfilingCharts(data) { + profilingData.totalSamples = data.totalSamples || 0; + profilingData.topMethods = data.topMethods || []; + + if (hotMethodsChart && data.topMethods && data.topMethods.length > 0) { + // Take top 10 methods + const top10 = data.topMethods.slice(0, 10); + + hotMethodsChart.data.labels = top10.map(m => { + const className = m.className.split('.').pop(); // Get simple class name + return className + '.' 
+ m.methodName; + }); + hotMethodsChart.data.datasets[0].data = top10.map(m => m.sampleCount); + hotMethodsChart.update('none'); + } +} + +/** + * Update contention chart data from server response + */ +export function updateContentionCharts(data) { + contentionData.totalContentionEvents = data.totalContentionEvents || 0; + contentionData.totalContentionTimeMs = data.totalContentionTimeMs || 0; + contentionData.hotspots = data.hotspots || []; + + if (contentionChart && data.hotspots && data.hotspots.length > 0) { + // Take top 10 hotspots + const top10 = data.hotspots.slice(0, 10); + + contentionChart.data.labels = top10.map(h => { + const className = h.monitorClass.split('.').pop(); + return className; + }); + contentionChart.data.datasets[0].data = top10.map(h => h.totalTimeMs); + contentionChart.update('none'); + } +} diff --git a/argus-frontend/src/main/resources/public/js/state.js b/argus-frontend/src/main/resources/public/js/state.js index 1f2a493..7cd1e9d 100644 --- a/argus-frontend/src/main/resources/public/js/state.js +++ b/argus-frontend/src/main/resources/public/js/state.js @@ -81,6 +81,53 @@ export const cpuData = { } }; +// Allocation data +export const allocationData = { + totalAllocations: 0, + totalAllocatedMB: 0, + allocationRateMBPerSec: 0, + peakAllocationRateMBPerSec: 0, + topAllocatingClasses: [], + history: { + labels: [], + rates: [] + } +}; + +// Metaspace data +export const metaspaceData = { + currentUsedMB: 0, + currentCommittedMB: 0, + peakUsedMB: 0, + growthRateMBPerMin: 0, + classCount: 0, + history: { + labels: [], + used: [], + committed: [] + } +}; + +// Method profiling data +export const profilingData = { + totalSamples: 0, + topMethods: [] +}; + +// Contention data +export const contentionData = { + totalContentionEvents: 0, + totalContentionTimeMs: 0, + hotspots: [] +}; + +// Correlation data +export const correlationData = { + gcCpuCorrelations: [], + gcPinningCorrelations: [], + recommendations: [] +}; + // Per-second event 
counters for charts export let currentSecondEvents = { start: 0, end: 0, pinned: 0 }; export let lastSecondTimestamp = Math.floor(Date.now() / 1000); diff --git a/argus-server/src/main/java/io/argus/server/ArgusServer.java b/argus-server/src/main/java/io/argus/server/ArgusServer.java index 1e6447d..209acd7 100644 --- a/argus-server/src/main/java/io/argus/server/ArgusServer.java +++ b/argus-server/src/main/java/io/argus/server/ArgusServer.java @@ -1,12 +1,21 @@ package io.argus.server; import io.argus.core.buffer.RingBuffer; +import io.argus.core.event.AllocationEvent; +import io.argus.core.event.ContentionEvent; import io.argus.core.event.CPUEvent; +import io.argus.core.event.ExecutionSampleEvent; import io.argus.core.event.GCEvent; +import io.argus.core.event.MetaspaceEvent; import io.argus.core.event.VirtualThreadEvent; +import io.argus.server.analysis.AllocationAnalyzer; import io.argus.server.analysis.CarrierThreadAnalyzer; +import io.argus.server.analysis.ContentionAnalyzer; +import io.argus.server.analysis.CorrelationAnalyzer; import io.argus.server.analysis.CPUAnalyzer; import io.argus.server.analysis.GCAnalyzer; +import io.argus.server.analysis.MetaspaceAnalyzer; +import io.argus.server.analysis.MethodProfilingAnalyzer; import io.argus.server.analysis.PinningAnalyzer; import io.argus.server.handler.ArgusChannelHandler; import io.argus.server.metrics.ServerMetrics; @@ -60,6 +69,11 @@ public final class ArgusServer { private final RingBuffer eventBuffer; private final RingBuffer gcEventBuffer; private final RingBuffer cpuEventBuffer; + private final RingBuffer allocationEventBuffer; + private final RingBuffer metaspaceEventBuffer; + private final RingBuffer executionSampleEventBuffer; + private final RingBuffer contentionEventBuffer; + private final boolean correlationEnabled; private final AtomicBoolean running = new AtomicBoolean(false); // Components @@ -72,6 +86,11 @@ public final class ArgusServer { private final CarrierThreadAnalyzer carrierAnalyzer = 
new CarrierThreadAnalyzer(); private final GCAnalyzer gcAnalyzer = new GCAnalyzer(); private final CPUAnalyzer cpuAnalyzer = new CPUAnalyzer(); + private final AllocationAnalyzer allocationAnalyzer = new AllocationAnalyzer(); + private final MetaspaceAnalyzer metaspaceAnalyzer = new MetaspaceAnalyzer(); + private final MethodProfilingAnalyzer methodProfilingAnalyzer = new MethodProfilingAnalyzer(); + private final ContentionAnalyzer contentionAnalyzer = new ContentionAnalyzer(); + private CorrelationAnalyzer correlationAnalyzer; private final ThreadStateManager threadStateManager = new ThreadStateManager(); private final EventJsonSerializer serializer = new EventJsonSerializer(); private EventBroadcaster broadcaster; @@ -88,7 +107,7 @@ public final class ArgusServer { * @param eventBuffer the ring buffer to read events from */ public ArgusServer(int port, RingBuffer eventBuffer) { - this(port, eventBuffer, null, null); + this(port, eventBuffer, null, null, null, null, null, null, false); } /** @@ -101,10 +120,38 @@ public ArgusServer(int port, RingBuffer eventBuffer) { */ public ArgusServer(int port, RingBuffer eventBuffer, RingBuffer gcEventBuffer, RingBuffer cpuEventBuffer) { + this(port, eventBuffer, gcEventBuffer, cpuEventBuffer, null, null, null, null, false); + } + + /** + * Creates a new Argus server with all event buffers. 
+ * + * @param port the port to listen on + * @param eventBuffer the ring buffer for virtual thread events + * @param gcEventBuffer the ring buffer for GC events (can be null) + * @param cpuEventBuffer the ring buffer for CPU events (can be null) + * @param allocationEventBuffer the ring buffer for allocation events (can be null) + * @param metaspaceEventBuffer the ring buffer for metaspace events (can be null) + * @param executionSampleEventBuffer the ring buffer for execution sample events (can be null) + * @param contentionEventBuffer the ring buffer for contention events (can be null) + * @param correlationEnabled whether correlation analysis is enabled + */ + public ArgusServer(int port, RingBuffer eventBuffer, + RingBuffer gcEventBuffer, RingBuffer cpuEventBuffer, + RingBuffer allocationEventBuffer, + RingBuffer metaspaceEventBuffer, + RingBuffer executionSampleEventBuffer, + RingBuffer contentionEventBuffer, + boolean correlationEnabled) { this.port = port; this.eventBuffer = eventBuffer; this.gcEventBuffer = gcEventBuffer; this.cpuEventBuffer = cpuEventBuffer; + this.allocationEventBuffer = allocationEventBuffer; + this.metaspaceEventBuffer = metaspaceEventBuffer; + this.executionSampleEventBuffer = executionSampleEventBuffer; + this.contentionEventBuffer = contentionEventBuffer; + this.correlationEnabled = correlationEnabled; } /** @@ -117,12 +164,20 @@ public void start() throws InterruptedException { throw new IllegalStateException("Server already running"); } - // Initialize broadcaster + // Initialize correlation analyzer if enabled + if (correlationEnabled) { + correlationAnalyzer = new CorrelationAnalyzer(); + } + + // Initialize broadcaster with all event buffers broadcaster = new EventBroadcaster( eventBuffer, gcEventBuffer, cpuEventBuffer, + allocationEventBuffer, metaspaceEventBuffer, + executionSampleEventBuffer, contentionEventBuffer, clients, metrics, activeThreads, recentEvents, threadEvents, pinningAnalyzer, carrierAnalyzer, gcAnalyzer, 
cpuAnalyzer, - threadStateManager, serializer); + allocationAnalyzer, metaspaceAnalyzer, methodProfilingAnalyzer, contentionAnalyzer, + correlationAnalyzer, threadStateManager, serializer); // Initialize Netty bossGroup = new NioEventLoopGroup(1); @@ -140,7 +195,13 @@ protected void initChannel(SocketChannel ch) { .addLast(new WebSocketServerCompressionHandler()) .addLast(new ArgusChannelHandler( clients, metrics, activeThreads, threadEvents, - gcAnalyzer, cpuAnalyzer, broadcaster)); + gcAnalyzer, cpuAnalyzer, + allocationEventBuffer != null ? allocationAnalyzer : null, + metaspaceEventBuffer != null ? metaspaceAnalyzer : null, + executionSampleEventBuffer != null ? methodProfilingAnalyzer : null, + contentionEventBuffer != null ? contentionAnalyzer : null, + correlationAnalyzer, + broadcaster)); } }) .option(ChannelOption.SO_BACKLOG, 128) diff --git a/argus-server/src/main/java/io/argus/server/analysis/AllocationAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/AllocationAnalyzer.java new file mode 100644 index 0000000..613c18d --- /dev/null +++ b/argus-server/src/main/java/io/argus/server/analysis/AllocationAnalyzer.java @@ -0,0 +1,214 @@ +package io.argus.server.analysis; + +import io.argus.core.event.AllocationEvent; + +import java.time.Instant; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Analyzes object allocation events and provides statistics. + * + *

    Tracks allocation rates, top allocating classes, and allocation history. + */ +public final class AllocationAnalyzer { + + private static final int MAX_HISTORY_SIZE = 60; + private static final int TOP_CLASSES_LIMIT = 10; + + private final AtomicLong totalAllocations = new AtomicLong(0); + private final AtomicLong totalBytesAllocated = new AtomicLong(0); + private final Map classAllocationCounts = new ConcurrentHashMap<>(); + private final Map classAllocationBytes = new ConcurrentHashMap<>(); + private final List history = new CopyOnWriteArrayList<>(); + + // Rolling window for rate calculation + private volatile long windowStartTime = System.currentTimeMillis(); + private volatile long windowBytes = 0; + private volatile double peakAllocationRate = 0; + + /** + * Records an allocation event for analysis. + * + * @param event the allocation event to record + */ + public void recordAllocationEvent(AllocationEvent event) { + totalAllocations.incrementAndGet(); + totalBytesAllocated.addAndGet(event.allocationSize()); + + // Track by class + String className = event.className() != null ? 
event.className() : "Unknown"; + classAllocationCounts.computeIfAbsent(className, k -> new AtomicLong()).incrementAndGet(); + classAllocationBytes.computeIfAbsent(className, k -> new AtomicLong()) + .addAndGet(event.allocationSize()); + + // Update window for rate calculation + updateRateWindow(event.allocationSize()); + } + + private synchronized void updateRateWindow(long bytes) { + long currentTime = System.currentTimeMillis(); + windowBytes += bytes; + + // Calculate rate every second + if (currentTime - windowStartTime >= 1000) { + double rate = windowBytes / ((currentTime - windowStartTime) / 1000.0); + + // Track peak + if (rate > peakAllocationRate) { + peakAllocationRate = rate; + } + + // Add to history + AllocationSnapshot snapshot = new AllocationSnapshot( + Instant.now(), + totalAllocations.get(), + totalBytesAllocated.get(), + rate + ); + history.add(snapshot); + while (history.size() > MAX_HISTORY_SIZE) { + history.removeFirst(); + } + + // Reset window + windowStartTime = currentTime; + windowBytes = 0; + } + } + + /** + * Returns the allocation analysis results. 
+ * + * @return the allocation analysis result + */ + public AllocationAnalysisResult getAnalysis() { + // Calculate current rate + long currentTime = System.currentTimeMillis(); + double currentRate = 0; + if (currentTime > windowStartTime) { + currentRate = windowBytes / ((currentTime - windowStartTime) / 1000.0); + } + + // Get top allocating classes + List topClasses = classAllocationBytes.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().get(), a.getValue().get())) + .limit(TOP_CLASSES_LIMIT) + .map(e -> new ClassAllocation( + e.getKey(), + classAllocationCounts.getOrDefault(e.getKey(), new AtomicLong()).get(), + e.getValue().get() + )) + .toList(); + + return new AllocationAnalysisResult( + totalAllocations.get(), + totalBytesAllocated.get(), + currentRate, + peakAllocationRate, + topClasses, + new ArrayList<>(history) + ); + } + + /** + * Returns the allocation history for charting. + * + * @return list of allocation snapshots + */ + public List getHistory() { + return new ArrayList<>(history); + } + + /** + * Returns the current allocation rate in bytes per second. + * + * @return current allocation rate + */ + public double getCurrentAllocationRate() { + long currentTime = System.currentTimeMillis(); + if (currentTime > windowStartTime) { + return windowBytes / ((currentTime - windowStartTime) / 1000.0); + } + return 0; + } + + /** + * Clears all recorded data. + */ + public void clear() { + totalAllocations.set(0); + totalBytesAllocated.set(0); + classAllocationCounts.clear(); + classAllocationBytes.clear(); + history.clear(); + windowStartTime = System.currentTimeMillis(); + windowBytes = 0; + peakAllocationRate = 0; + } + + /** + * Allocation by class. + */ + public record ClassAllocation( + String className, + long allocationCount, + long totalBytes + ) { + } + + /** + * Snapshot of allocation state at a point in time. 
+ */ + public record AllocationSnapshot( + Instant timestamp, + long totalAllocations, + long totalBytes, + double allocationRateBytesPerSec + ) { + /** + * Returns the allocation rate in MB/s. + */ + public double allocationRateMBPerSec() { + return allocationRateBytesPerSec / (1024.0 * 1024.0); + } + } + + /** + * Result of allocation analysis. + */ + public record AllocationAnalysisResult( + long totalAllocations, + long totalBytesAllocated, + double allocationRateBytesPerSec, + double peakAllocationRate, + List topAllocatingClasses, + List history + ) { + /** + * Returns the total allocated in MB. + */ + public double totalAllocatedMB() { + return totalBytesAllocated / (1024.0 * 1024.0); + } + + /** + * Returns the allocation rate in MB/s. + */ + public double allocationRateMBPerSec() { + return allocationRateBytesPerSec / (1024.0 * 1024.0); + } + + /** + * Returns the peak allocation rate in MB/s. + */ + public double peakAllocationRateMBPerSec() { + return peakAllocationRate / (1024.0 * 1024.0); + } + } +} diff --git a/argus-server/src/main/java/io/argus/server/analysis/ContentionAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/ContentionAnalyzer.java new file mode 100644 index 0000000..b32f9db --- /dev/null +++ b/argus-server/src/main/java/io/argus/server/analysis/ContentionAnalyzer.java @@ -0,0 +1,173 @@ +package io.argus.server.analysis; + +import io.argus.core.event.ContentionEvent; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Analyzes thread contention events and provides statistics. + * + *

    Tracks lock contention hotspots, duration, and thread-level statistics. + */ +public final class ContentionAnalyzer { + + private static final int TOP_HOTSPOTS_LIMIT = 10; + + private final AtomicLong totalContentionEvents = new AtomicLong(0); + private final AtomicLong totalContentionTimeNanos = new AtomicLong(0); + private final Map monitorStats = new ConcurrentHashMap<>(); + private final Map threadContentionTime = new ConcurrentHashMap<>(); + + /** + * Records a contention event for analysis. + * + * @param event the contention event to record + */ + public void recordContentionEvent(ContentionEvent event) { + totalContentionEvents.incrementAndGet(); + totalContentionTimeNanos.addAndGet(event.durationNanos()); + + // Track by monitor class + String monitorClass = event.monitorClass() != null ? event.monitorClass() : "Unknown"; + monitorStats.computeIfAbsent(monitorClass, k -> new ContentionStats()) + .record(event.durationNanos(), event.type()); + + // Track by thread + threadContentionTime.computeIfAbsent(event.threadId(), k -> new AtomicLong()) + .addAndGet(event.durationNanos()); + } + + /** + * Returns the contention analysis results. + * + * @return the contention analysis result + */ + public ContentionAnalysisResult getAnalysis() { + long totalEvents = totalContentionEvents.get(); + long totalTimeNanos = totalContentionTimeNanos.get(); + long totalTimeMs = totalTimeNanos / 1_000_000; + + // Get top hotspots + List hotspots = monitorStats.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().totalTimeNanos.get(), a.getValue().totalTimeNanos.get())) + .limit(TOP_HOTSPOTS_LIMIT) + .map(e -> { + ContentionStats stats = e.getValue(); + double percentage = totalTimeNanos > 0 + ? 
(stats.totalTimeNanos.get() * 100.0) / totalTimeNanos : 0; + return new ContentionHotspot( + e.getKey(), + stats.eventCount.get(), + stats.totalTimeNanos.get() / 1_000_000, + stats.enterCount.get(), + stats.waitCount.get(), + percentage + ); + }) + .toList(); + + // Build thread contention map + Map threadContention = new ConcurrentHashMap<>(); + threadContentionTime.forEach((threadId, time) -> + threadContention.put("Thread-" + threadId, time.get() / 1_000_000)); + + return new ContentionAnalysisResult( + totalEvents, + totalTimeMs, + hotspots, + threadContention + ); + } + + /** + * Returns the top contention hotspots. + * + * @param limit maximum number of hotspots to return + * @return list of contention hotspots + */ + public List getTopHotspots(int limit) { + long totalTimeNanos = totalContentionTimeNanos.get(); + + return monitorStats.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().totalTimeNanos.get(), a.getValue().totalTimeNanos.get())) + .limit(limit) + .map(e -> { + ContentionStats stats = e.getValue(); + double percentage = totalTimeNanos > 0 + ? (stats.totalTimeNanos.get() * 100.0) / totalTimeNanos : 0; + return new ContentionHotspot( + e.getKey(), + stats.eventCount.get(), + stats.totalTimeNanos.get() / 1_000_000, + stats.enterCount.get(), + stats.waitCount.get(), + percentage + ); + }) + .toList(); + } + + /** + * Clears all recorded data. + */ + public void clear() { + totalContentionEvents.set(0); + totalContentionTimeNanos.set(0); + monitorStats.clear(); + threadContentionTime.clear(); + } + + /** + * Internal statistics for a monitor class. 
+ */ + private static class ContentionStats { + final AtomicLong eventCount = new AtomicLong(0); + final AtomicLong totalTimeNanos = new AtomicLong(0); + final AtomicLong enterCount = new AtomicLong(0); + final AtomicLong waitCount = new AtomicLong(0); + + void record(long durationNanos, ContentionEvent.ContentionType type) { + eventCount.incrementAndGet(); + totalTimeNanos.addAndGet(durationNanos); + if (type == ContentionEvent.ContentionType.ENTER) { + enterCount.incrementAndGet(); + } else { + waitCount.incrementAndGet(); + } + } + } + + /** + * A contention hotspot identified by analysis. + */ + public record ContentionHotspot( + String monitorClass, + long eventCount, + long totalTimeMs, + long enterCount, + long waitCount, + double percentage + ) { + /** + * Returns the average contention time in ms. + */ + public double avgTimeMs() { + return eventCount > 0 ? (double) totalTimeMs / eventCount : 0; + } + } + + /** + * Result of contention analysis. + */ + public record ContentionAnalysisResult( + long totalContentionEvents, + long totalContentionTimeMs, + List hotspots, + Map threadContentionTime + ) { + } +} diff --git a/argus-server/src/main/java/io/argus/server/analysis/CorrelationAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/CorrelationAnalyzer.java new file mode 100644 index 0000000..1955002 --- /dev/null +++ b/argus-server/src/main/java/io/argus/server/analysis/CorrelationAnalyzer.java @@ -0,0 +1,278 @@ +package io.argus.server.analysis; + +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; + +/** + * Analyzes correlations between different event types. + * + *

    <p>Detects patterns such as: + * <ul> + *   <li>GC ↔ CPU spike correlations</li> + *   <li>GC ↔ Pinning correlations</li> + *   <li>Automatic recommendations based on detected patterns</li> + * </ul>
    + */ +public final class CorrelationAnalyzer { + + private static final int MAX_CORRELATIONS = 50; + private static final Duration CORRELATION_WINDOW = Duration.ofSeconds(1); + private static final double CPU_SPIKE_THRESHOLD = 0.7; // 70% CPU + private static final double GC_OVERHEAD_WARNING_THRESHOLD = 0.10; // 10% + private static final double HIGH_ALLOCATION_RATE_THRESHOLD = 100 * 1024 * 1024; // 100 MB/s + + private final List gcCpuCorrelations = new CopyOnWriteArrayList<>(); + private final List gcPinningCorrelations = new CopyOnWriteArrayList<>(); + private final List recommendations = new CopyOnWriteArrayList<>(); + + // Tracking for correlation detection + private final List recentGCEvents = new CopyOnWriteArrayList<>(); + private final List recentCPUSpikes = new CopyOnWriteArrayList<>(); + private final List recentPinningEvents = new CopyOnWriteArrayList<>(); + + /** + * Records a GC event for correlation analysis. + * + * @param timestamp the GC event timestamp + * @param gcName the GC name + * @param pauseTimeMs the pause time in milliseconds + */ + public void recordGCEvent(Instant timestamp, String gcName, double pauseTimeMs) { + recentGCEvents.add(new GCTimestamp(timestamp, gcName, pauseTimeMs)); + + // Clean old events + cleanOldEvents(); + + // Check for correlations + checkGCCPUCorrelation(timestamp, gcName, pauseTimeMs); + checkGCPinningCorrelation(timestamp, gcName, pauseTimeMs); + } + + /** + * Records a CPU spike for correlation analysis. + * + * @param timestamp the spike timestamp + * @param cpuLoad the CPU load (0.0-1.0) + */ + public void recordCPUSpike(Instant timestamp, double cpuLoad) { + if (cpuLoad >= CPU_SPIKE_THRESHOLD) { + recentCPUSpikes.add(new CPUSpikeTimestamp(timestamp, cpuLoad)); + cleanOldEvents(); + } + } + + /** + * Records a pinning event for correlation analysis. 
+ * + * @param timestamp the pinning timestamp + * @param threadName the thread name + */ + public void recordPinningEvent(Instant timestamp, String threadName) { + recentPinningEvents.add(new PinningTimestamp(timestamp, threadName)); + cleanOldEvents(); + } + + /** + * Updates recommendations based on current metrics. + * + * @param gcOverheadPercent current GC overhead percentage + * @param heapGrowthRateMB heap growth rate in MB/min + * @param allocationRateMBps allocation rate in MB/s + * @param contentionTimeMs total contention time in ms + * @param metaspaceGrowthMB metaspace growth rate in MB/min + */ + public void updateRecommendations(double gcOverheadPercent, double heapGrowthRateMB, + double allocationRateMBps, long contentionTimeMs, + double metaspaceGrowthMB) { + recommendations.clear(); + + // GC overhead warning + if (gcOverheadPercent > GC_OVERHEAD_WARNING_THRESHOLD * 100) { + recommendations.add(new Recommendation( + RecommendationType.GC_OVERHEAD_HIGH, + "High GC Overhead", + String.format("GC overhead is %.1f%%, exceeding the 10%% threshold. " + + "Consider increasing heap size or tuning GC parameters.", gcOverheadPercent), + Severity.WARNING + )); + } + + // Memory leak suspected + if (heapGrowthRateMB > 10) { // 10 MB/min growth + recommendations.add(new Recommendation( + RecommendationType.MEMORY_LEAK_SUSPECTED, + "Potential Memory Leak", + String.format("Heap is growing at %.1f MB/min. This may indicate a memory leak. " + + "Consider using heap dump analysis.", heapGrowthRateMB), + Severity.WARNING + )); + } + + // High allocation rate + if (allocationRateMBps > HIGH_ALLOCATION_RATE_THRESHOLD / (1024 * 1024)) { + recommendations.add(new Recommendation( + RecommendationType.ALLOCATION_RATE_HIGH, + "High Allocation Rate", + String.format("Allocation rate is %.1f MB/s. High allocation can cause frequent GC. 
" + + "Consider object pooling or reducing allocations.", allocationRateMBps), + Severity.INFO + )); + } + + // Contention hotspot + if (contentionTimeMs > 1000) { // More than 1 second of contention + recommendations.add(new Recommendation( + RecommendationType.CONTENTION_HOTSPOT, + "Lock Contention Detected", + String.format("Total lock contention time is %d ms. " + + "Review synchronized blocks and consider using concurrent alternatives.", contentionTimeMs), + Severity.WARNING + )); + } + + // Metaspace growth + if (metaspaceGrowthMB > 1) { // 1 MB/min growth + recommendations.add(new Recommendation( + RecommendationType.METASPACE_GROWTH, + "Metaspace Growing", + String.format("Metaspace is growing at %.2f MB/min. " + + "This may indicate class loader leaks or excessive dynamic class generation.", metaspaceGrowthMB), + Severity.INFO + )); + } + } + + /** + * Returns the correlation analysis results. + * + * @return the correlation result + */ + public CorrelationResult getAnalysis() { + return new CorrelationResult( + new ArrayList<>(gcCpuCorrelations), + new ArrayList<>(gcPinningCorrelations), + new ArrayList<>(recommendations) + ); + } + + /** + * Clears all recorded data. 
+ */ + public void clear() { + gcCpuCorrelations.clear(); + gcPinningCorrelations.clear(); + recommendations.clear(); + recentGCEvents.clear(); + recentCPUSpikes.clear(); + recentPinningEvents.clear(); + } + + private void checkGCCPUCorrelation(Instant gcTimestamp, String gcName, double pauseTimeMs) { + // Look for CPU spikes within 1 second of GC event + for (CPUSpikeTimestamp spike : recentCPUSpikes) { + Duration diff = Duration.between(spike.timestamp, gcTimestamp).abs(); + if (diff.compareTo(CORRELATION_WINDOW) <= 0) { + CorrelatedEvent correlation = new CorrelatedEvent( + gcTimestamp, + "GC_PAUSE", + "CPU_SPIKE", + String.format("GC '%s' (%.1fms pause) occurred with CPU spike (%.1f%%)", + gcName, pauseTimeMs, spike.cpuLoad * 100) + ); + gcCpuCorrelations.add(correlation); + trimList(gcCpuCorrelations); + } + } + } + + private void checkGCPinningCorrelation(Instant gcTimestamp, String gcName, double pauseTimeMs) { + // Look for pinning events within 1 second of GC event + for (PinningTimestamp pinning : recentPinningEvents) { + Duration diff = Duration.between(pinning.timestamp, gcTimestamp).abs(); + if (diff.compareTo(CORRELATION_WINDOW) <= 0) { + CorrelatedEvent correlation = new CorrelatedEvent( + gcTimestamp, + "GC_PAUSE", + "PINNING", + String.format("GC '%s' occurred with pinned thread '%s'", + gcName, pinning.threadName) + ); + gcPinningCorrelations.add(correlation); + trimList(gcPinningCorrelations); + } + } + } + + private void cleanOldEvents() { + Instant cutoff = Instant.now().minus(Duration.ofSeconds(10)); + recentGCEvents.removeIf(e -> e.timestamp.isBefore(cutoff)); + recentCPUSpikes.removeIf(e -> e.timestamp.isBefore(cutoff)); + recentPinningEvents.removeIf(e -> e.timestamp.isBefore(cutoff)); + } + + private void trimList(List list) { + while (list.size() > MAX_CORRELATIONS) { + list.removeFirst(); + } + } + + // Internal timestamp tracking records + private record GCTimestamp(Instant timestamp, String gcName, double pauseTimeMs) {} + private 
record CPUSpikeTimestamp(Instant timestamp, double cpuLoad) {} + private record PinningTimestamp(Instant timestamp, String threadName) {} + + /** + * A correlated event between two event types. + */ + public record CorrelatedEvent( + Instant timestamp, + String primaryEvent, + String correlatedEvent, + String description + ) { + } + + /** + * Types of recommendations. + */ + public enum RecommendationType { + GC_OVERHEAD_HIGH, + MEMORY_LEAK_SUSPECTED, + CONTENTION_HOTSPOT, + ALLOCATION_RATE_HIGH, + METASPACE_GROWTH + } + + /** + * Severity levels for recommendations. + */ + public enum Severity { + INFO, + WARNING, + CRITICAL + } + + /** + * A recommendation based on detected patterns. + */ + public record Recommendation( + RecommendationType type, + String title, + String description, + Severity severity + ) { + } + + /** + * Result of correlation analysis. + */ + public record CorrelationResult( + List gcCpuCorrelations, + List gcPinningCorrelations, + List recommendations + ) { + } +} diff --git a/argus-server/src/main/java/io/argus/server/analysis/GCAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/GCAnalyzer.java index 53ec263..57b2461 100644 --- a/argus-server/src/main/java/io/argus/server/analysis/GCAnalyzer.java +++ b/argus-server/src/main/java/io/argus/server/analysis/GCAnalyzer.java @@ -32,6 +32,11 @@ public final class GCAnalyzer { private volatile long lastHeapCommitted = 0; private volatile Instant lastGCTime = null; + // GC overhead tracking + private volatile long overheadWindowStartTime = System.currentTimeMillis(); + private volatile long overheadWindowPauseNanos = 0; + private volatile double currentGcOverheadPercent = 0; + /** * Records a GC event for analysis. 
* @@ -44,6 +49,9 @@ public void recordGCEvent(GCEvent event) { if (event.duration() > 0) { totalPauseTimeNanos.addAndGet(event.duration()); updateMax(maxPauseTimeNanos, event.duration()); + + // Update GC overhead calculation + updateGCOverhead(event.duration()); } // Track cause distribution @@ -78,6 +86,23 @@ public void recordGCEvent(GCEvent event) { } } + private synchronized void updateGCOverhead(long pauseNanos) { + long currentTime = System.currentTimeMillis(); + overheadWindowPauseNanos += pauseNanos; + + // Calculate overhead every 10 seconds + long windowDurationMs = currentTime - overheadWindowStartTime; + if (windowDurationMs >= 10000) { + // Convert window duration to nanos for calculation + long windowDurationNanos = windowDurationMs * 1_000_000L; + currentGcOverheadPercent = (overheadWindowPauseNanos * 100.0) / windowDurationNanos; + + // Reset window + overheadWindowStartTime = currentTime; + overheadWindowPauseNanos = 0; + } + } + /** * Returns the GC analysis results. * @@ -99,6 +124,9 @@ public GCAnalysisResult getAnalysis() { Map causes = new ConcurrentHashMap<>(); causeDistribution.forEach((cause, count) -> causes.put(cause, count.get())); + // GC overhead warning if > 10% + boolean overheadWarning = currentGcOverheadPercent > 10.0; + return new GCAnalysisResult( total, totalPause / 1_000_000, // Convert to ms @@ -108,10 +136,21 @@ public GCAnalysisResult getAnalysis() { causes, lastHeapUsed, lastHeapCommitted, - lastGCTime + lastGCTime, + currentGcOverheadPercent, + overheadWarning ); } + /** + * Returns the current GC overhead percentage. + * + * @return GC overhead as a percentage (0-100) + */ + public double getCurrentGcOverheadPercent() { + return currentGcOverheadPercent; + } + /** * Returns the recent GC events for charting. 
* @@ -155,6 +194,9 @@ public void clear() { lastHeapUsed = 0; lastHeapCommitted = 0; lastGCTime = null; + overheadWindowStartTime = System.currentTimeMillis(); + overheadWindowPauseNanos = 0; + currentGcOverheadPercent = 0; } private void updateMax(AtomicLong max, long value) { @@ -193,7 +235,9 @@ public record GCAnalysisResult( Map causeDistribution, long currentHeapUsed, long currentHeapCommitted, - Instant lastGCTime + Instant lastGCTime, + double gcOverheadPercent, + boolean isOverheadWarning ) { } } diff --git a/argus-server/src/main/java/io/argus/server/analysis/MetaspaceAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/MetaspaceAnalyzer.java new file mode 100644 index 0000000..630d714 --- /dev/null +++ b/argus-server/src/main/java/io/argus/server/analysis/MetaspaceAnalyzer.java @@ -0,0 +1,215 @@ +package io.argus.server.analysis; + +import io.argus.core.event.MetaspaceEvent; + +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Analyzes metaspace events and provides statistics. + * + *

    Tracks metaspace usage, growth rate, and history. + */ +public final class MetaspaceAnalyzer { + + private static final int MAX_HISTORY_SIZE = 60; + + private final List history = new CopyOnWriteArrayList<>(); + private final AtomicLong totalEvents = new AtomicLong(0); + + // Current state + private volatile long currentUsed = 0; + private volatile long currentCommitted = 0; + private volatile long currentReserved = 0; + private volatile long currentClassCount = 0; + private volatile Instant lastUpdateTime = null; + + // Peak tracking + private volatile long peakUsed = 0; + + // Growth tracking + private volatile long initialUsed = -1; + private volatile Instant initialTime = null; + + /** + * Records a metaspace event for analysis. + * + * @param event the metaspace event to record + */ + public void recordMetaspaceEvent(MetaspaceEvent event) { + totalEvents.incrementAndGet(); + + // Track initial value for growth calculation + if (initialUsed < 0) { + initialUsed = event.metaspaceUsed(); + initialTime = event.timestamp(); + } + + // Update current state + currentUsed = event.metaspaceUsed(); + currentCommitted = event.metaspaceCommitted(); + currentReserved = event.metaspaceReserved(); + currentClassCount = event.classCount(); + lastUpdateTime = event.timestamp(); + + // Track peak + if (currentUsed > peakUsed) { + peakUsed = currentUsed; + } + + // Add to history + MetaspaceSnapshot snapshot = new MetaspaceSnapshot( + event.timestamp(), + event.metaspaceUsed(), + event.metaspaceCommitted(), + event.metaspaceReserved(), + event.classCount() + ); + + history.add(snapshot); + while (history.size() > MAX_HISTORY_SIZE) { + history.removeFirst(); + } + } + + /** + * Returns the metaspace analysis results. 
+ * + * @return the metaspace analysis result + */ + public MetaspaceAnalysisResult getAnalysis() { + // Calculate growth rate (bytes per minute) + double growthRatePerMin = 0; + if (initialTime != null && lastUpdateTime != null && initialUsed >= 0) { + long durationMs = java.time.Duration.between(initialTime, lastUpdateTime).toMillis(); + if (durationMs > 0) { + double durationMinutes = durationMs / 60000.0; + growthRatePerMin = (currentUsed - initialUsed) / durationMinutes; + } + } + + return new MetaspaceAnalysisResult( + currentUsed, + currentCommitted, + currentReserved, + currentClassCount, + peakUsed, + growthRatePerMin, + new ArrayList<>(history), + lastUpdateTime + ); + } + + /** + * Returns the metaspace history for charting. + * + * @return list of metaspace snapshots + */ + public List getHistory() { + return new ArrayList<>(history); + } + + /** + * Returns the current metaspace used. + * + * @return current metaspace used in bytes + */ + public long getCurrentUsed() { + return currentUsed; + } + + /** + * Returns the current metaspace committed. + * + * @return current metaspace committed in bytes + */ + public long getCurrentCommitted() { + return currentCommitted; + } + + /** + * Clears all recorded data. + */ + public void clear() { + history.clear(); + totalEvents.set(0); + currentUsed = 0; + currentCommitted = 0; + currentReserved = 0; + currentClassCount = 0; + lastUpdateTime = null; + peakUsed = 0; + initialUsed = -1; + initialTime = null; + } + + /** + * Snapshot of metaspace state at a point in time. + */ + public record MetaspaceSnapshot( + Instant timestamp, + long used, + long committed, + long reserved, + long classCount + ) { + /** + * Returns the metaspace used in MB. + */ + public double usedMB() { + return used / (1024.0 * 1024.0); + } + + /** + * Returns the metaspace committed in MB. + */ + public double committedMB() { + return committed / (1024.0 * 1024.0); + } + } + + /** + * Result of metaspace analysis. 
+ */ + public record MetaspaceAnalysisResult( + long currentUsed, + long currentCommitted, + long currentReserved, + long currentClassCount, + long peakUsed, + double growthRatePerMin, + List history, + Instant lastUpdateTime + ) { + /** + * Returns the current used in MB. + */ + public double currentUsedMB() { + return currentUsed / (1024.0 * 1024.0); + } + + /** + * Returns the current committed in MB. + */ + public double currentCommittedMB() { + return currentCommitted / (1024.0 * 1024.0); + } + + /** + * Returns the peak used in MB. + */ + public double peakUsedMB() { + return peakUsed / (1024.0 * 1024.0); + } + + /** + * Returns the growth rate in MB per minute. + */ + public double growthRateMBPerMin() { + return growthRatePerMin / (1024.0 * 1024.0); + } + } +} diff --git a/argus-server/src/main/java/io/argus/server/analysis/MethodProfilingAnalyzer.java b/argus-server/src/main/java/io/argus/server/analysis/MethodProfilingAnalyzer.java new file mode 100644 index 0000000..f1b87b7 --- /dev/null +++ b/argus-server/src/main/java/io/argus/server/analysis/MethodProfilingAnalyzer.java @@ -0,0 +1,141 @@ +package io.argus.server.analysis; + +import io.argus.core.event.ExecutionSampleEvent; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Analyzes execution sample events for CPU profiling. + * + *

    Tracks hot methods and provides method profiling statistics. + */ +public final class MethodProfilingAnalyzer { + + private static final int TOP_METHODS_LIMIT = 20; + + private final AtomicLong totalSamples = new AtomicLong(0); + private final Map methodSampleCounts = new ConcurrentHashMap<>(); + private final Map packageSampleCounts = new ConcurrentHashMap<>(); + + /** + * Records an execution sample event for analysis. + * + * @param event the execution sample event to record + */ + public void recordExecutionSample(ExecutionSampleEvent event) { + if (event == null) { + return; + } + + totalSamples.incrementAndGet(); + + // Track by fully qualified method name + String methodKey = event.fullyQualifiedMethod(); + methodSampleCounts.computeIfAbsent(methodKey, k -> new AtomicLong()).incrementAndGet(); + + // Track by package + String packageName = event.packageName(); + if (packageName != null && !packageName.isEmpty()) { + packageSampleCounts.computeIfAbsent(packageName, k -> new AtomicLong()).incrementAndGet(); + } + } + + /** + * Returns the method profiling analysis results. + * + * @return the profiling analysis result + */ + public MethodProfilingResult getAnalysis() { + long total = totalSamples.get(); + + // Get top methods + List topMethods = methodSampleCounts.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().get(), a.getValue().get())) + .limit(TOP_METHODS_LIMIT) + .map(e -> { + String fullMethod = e.getKey(); + long count = e.getValue().get(); + double percentage = total > 0 ? (count * 100.0) / total : 0; + + // Parse class and method name + int lastDot = fullMethod.lastIndexOf('.'); + String className = lastDot > 0 ? fullMethod.substring(0, lastDot) : "Unknown"; + String methodName = lastDot > 0 ? 
fullMethod.substring(lastDot + 1) : fullMethod; + + return new HotMethod(className, methodName, count, percentage); + }) + .toList(); + + // Build package distribution + Map packageDistribution = new ConcurrentHashMap<>(); + packageSampleCounts.forEach((pkg, count) -> packageDistribution.put(pkg, count.get())); + + return new MethodProfilingResult(total, topMethods, packageDistribution); + } + + /** + * Returns the top hot methods. + * + * @param limit maximum number of methods to return + * @return list of hot methods + */ + public List getTopMethods(int limit) { + long total = totalSamples.get(); + + return methodSampleCounts.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue().get(), a.getValue().get())) + .limit(limit) + .map(e -> { + String fullMethod = e.getKey(); + long count = e.getValue().get(); + double percentage = total > 0 ? (count * 100.0) / total : 0; + + int lastDot = fullMethod.lastIndexOf('.'); + String className = lastDot > 0 ? fullMethod.substring(0, lastDot) : "Unknown"; + String methodName = lastDot > 0 ? fullMethod.substring(lastDot + 1) : fullMethod; + + return new HotMethod(className, methodName, count, percentage); + }) + .toList(); + } + + /** + * Clears all recorded data. + */ + public void clear() { + totalSamples.set(0); + methodSampleCounts.clear(); + packageSampleCounts.clear(); + } + + /** + * A hot method identified by CPU profiling. + */ + public record HotMethod( + String className, + String methodName, + long sampleCount, + double percentage + ) { + /** + * Returns the fully qualified method name. + */ + public String fullyQualifiedName() { + return className + "." + methodName; + } + } + + /** + * Result of method profiling analysis. 
+ */ + public record MethodProfilingResult( + long totalSamples, + List topMethods, + Map packageDistribution + ) { + } +} diff --git a/argus-server/src/main/java/io/argus/server/handler/ArgusChannelHandler.java b/argus-server/src/main/java/io/argus/server/handler/ArgusChannelHandler.java index 00dee90..788cc44 100644 --- a/argus-server/src/main/java/io/argus/server/handler/ArgusChannelHandler.java +++ b/argus-server/src/main/java/io/argus/server/handler/ArgusChannelHandler.java @@ -2,8 +2,13 @@ import java.util.Map; +import io.argus.server.analysis.AllocationAnalyzer; +import io.argus.server.analysis.ContentionAnalyzer; +import io.argus.server.analysis.CorrelationAnalyzer; import io.argus.server.analysis.CPUAnalyzer; import io.argus.server.analysis.GCAnalyzer; +import io.argus.server.analysis.MetaspaceAnalyzer; +import io.argus.server.analysis.MethodProfilingAnalyzer; import io.argus.server.http.HttpResponseHelper; import io.argus.server.http.StaticFileHandler; import io.argus.server.metrics.ServerMetrics; @@ -36,13 +41,18 @@ public final class ArgusChannelHandler extends SimpleChannelInboundHandler allThreads = Thread.getAllStackTraces(); diff --git a/argus-server/src/main/java/io/argus/server/serialization/EventJsonSerializer.java b/argus-server/src/main/java/io/argus/server/serialization/EventJsonSerializer.java index d61e5db..9b41c7f 100644 --- a/argus-server/src/main/java/io/argus/server/serialization/EventJsonSerializer.java +++ b/argus-server/src/main/java/io/argus/server/serialization/EventJsonSerializer.java @@ -60,6 +60,10 @@ public String getShortTypeName(EventType eventType) { case GC_PAUSE -> "GC_PAUSE"; case GC_HEAP_SUMMARY -> "GC_HEAP_SUMMARY"; case CPU_LOAD -> "CPU_LOAD"; + case ALLOCATION -> "ALLOCATION"; + case METASPACE_SUMMARY -> "METASPACE_SUMMARY"; + case EXECUTION_SAMPLE -> "EXECUTION_SAMPLE"; + case CONTENTION -> "CONTENTION"; }; } diff --git a/argus-server/src/main/java/io/argus/server/websocket/EventBroadcaster.java 
b/argus-server/src/main/java/io/argus/server/websocket/EventBroadcaster.java index 6ed5f1f..3c3e479 100644 --- a/argus-server/src/main/java/io/argus/server/websocket/EventBroadcaster.java +++ b/argus-server/src/main/java/io/argus/server/websocket/EventBroadcaster.java @@ -1,12 +1,21 @@ package io.argus.server.websocket; import io.argus.core.buffer.RingBuffer; +import io.argus.core.event.AllocationEvent; +import io.argus.core.event.ContentionEvent; import io.argus.core.event.CPUEvent; +import io.argus.core.event.ExecutionSampleEvent; import io.argus.core.event.GCEvent; +import io.argus.core.event.MetaspaceEvent; import io.argus.core.event.VirtualThreadEvent; +import io.argus.server.analysis.AllocationAnalyzer; import io.argus.server.analysis.CarrierThreadAnalyzer; +import io.argus.server.analysis.ContentionAnalyzer; +import io.argus.server.analysis.CorrelationAnalyzer; import io.argus.server.analysis.CPUAnalyzer; import io.argus.server.analysis.GCAnalyzer; +import io.argus.server.analysis.MetaspaceAnalyzer; +import io.argus.server.analysis.MethodProfilingAnalyzer; import io.argus.server.analysis.PinningAnalyzer; import io.argus.server.metrics.ServerMetrics; import io.argus.server.serialization.EventJsonSerializer; @@ -41,6 +50,10 @@ public final class EventBroadcaster { private final RingBuffer eventBuffer; private final RingBuffer gcEventBuffer; private final RingBuffer cpuEventBuffer; + private final RingBuffer allocationEventBuffer; + private final RingBuffer metaspaceEventBuffer; + private final RingBuffer executionSampleEventBuffer; + private final RingBuffer contentionEventBuffer; private final ChannelGroup clients; private final List exportableEvents = Collections.synchronizedList(new ArrayList<>()); private final ServerMetrics metrics; @@ -51,33 +64,51 @@ public final class EventBroadcaster { private final CarrierThreadAnalyzer carrierAnalyzer; private final GCAnalyzer gcAnalyzer; private final CPUAnalyzer cpuAnalyzer; + private final AllocationAnalyzer 
allocationAnalyzer; + private final MetaspaceAnalyzer metaspaceAnalyzer; + private final MethodProfilingAnalyzer methodProfilingAnalyzer; + private final ContentionAnalyzer contentionAnalyzer; + private final CorrelationAnalyzer correlationAnalyzer; private final ThreadStateManager threadStateManager; private final EventJsonSerializer serializer; private final ScheduledExecutorService scheduler; private final ScheduledExecutorService stateScheduler; /** - * Creates an event broadcaster. + * Creates an event broadcaster with full event buffer support. * - * @param eventBuffer the ring buffer to drain virtual thread events from - * @param gcEventBuffer the ring buffer to drain GC events from (can be null) - * @param cpuEventBuffer the ring buffer to drain CPU events from (can be null) - * @param clients the channel group of connected WebSocket clients - * @param metrics the server metrics tracker - * @param activeThreads the active threads registry - * @param recentEvents the recent events buffer - * @param threadEvents the per-thread events buffer - * @param pinningAnalyzer the pinning analyzer for hotspot detection - * @param carrierAnalyzer the carrier thread analyzer - * @param gcAnalyzer the GC analyzer - * @param cpuAnalyzer the CPU analyzer - * @param threadStateManager the thread state manager for real-time state tracking - * @param serializer the event JSON serializer + * @param eventBuffer the ring buffer for virtual thread events + * @param gcEventBuffer the ring buffer for GC events (can be null) + * @param cpuEventBuffer the ring buffer for CPU events (can be null) + * @param allocationEventBuffer the ring buffer for allocation events (can be null) + * @param metaspaceEventBuffer the ring buffer for metaspace events (can be null) + * @param executionSampleEventBuffer the ring buffer for execution sample events (can be null) + * @param contentionEventBuffer the ring buffer for contention events (can be null) + * @param clients the channel group of 
connected WebSocket clients + * @param metrics the server metrics tracker + * @param activeThreads the active threads registry + * @param recentEvents the recent events buffer + * @param threadEvents the per-thread events buffer + * @param pinningAnalyzer the pinning analyzer for hotspot detection + * @param carrierAnalyzer the carrier thread analyzer + * @param gcAnalyzer the GC analyzer + * @param cpuAnalyzer the CPU analyzer + * @param allocationAnalyzer the allocation analyzer (can be null) + * @param metaspaceAnalyzer the metaspace analyzer (can be null) + * @param methodProfilingAnalyzer the method profiling analyzer (can be null) + * @param contentionAnalyzer the contention analyzer (can be null) + * @param correlationAnalyzer the correlation analyzer (can be null) + * @param threadStateManager the thread state manager for real-time state tracking + * @param serializer the event JSON serializer */ public EventBroadcaster( RingBuffer eventBuffer, RingBuffer gcEventBuffer, RingBuffer cpuEventBuffer, + RingBuffer allocationEventBuffer, + RingBuffer metaspaceEventBuffer, + RingBuffer executionSampleEventBuffer, + RingBuffer contentionEventBuffer, ChannelGroup clients, ServerMetrics metrics, ActiveThreadsRegistry activeThreads, @@ -87,11 +118,20 @@ public EventBroadcaster( CarrierThreadAnalyzer carrierAnalyzer, GCAnalyzer gcAnalyzer, CPUAnalyzer cpuAnalyzer, + AllocationAnalyzer allocationAnalyzer, + MetaspaceAnalyzer metaspaceAnalyzer, + MethodProfilingAnalyzer methodProfilingAnalyzer, + ContentionAnalyzer contentionAnalyzer, + CorrelationAnalyzer correlationAnalyzer, ThreadStateManager threadStateManager, EventJsonSerializer serializer) { this.eventBuffer = eventBuffer; this.gcEventBuffer = gcEventBuffer; this.cpuEventBuffer = cpuEventBuffer; + this.allocationEventBuffer = allocationEventBuffer; + this.metaspaceEventBuffer = metaspaceEventBuffer; + this.executionSampleEventBuffer = executionSampleEventBuffer; + this.contentionEventBuffer = 
contentionEventBuffer; this.clients = clients; this.metrics = metrics; this.activeThreads = activeThreads; @@ -101,6 +141,11 @@ public EventBroadcaster( this.carrierAnalyzer = carrierAnalyzer; this.gcAnalyzer = gcAnalyzer; this.cpuAnalyzer = cpuAnalyzer; + this.allocationAnalyzer = allocationAnalyzer; + this.metaspaceAnalyzer = metaspaceAnalyzer; + this.methodProfilingAnalyzer = methodProfilingAnalyzer; + this.contentionAnalyzer = contentionAnalyzer; + this.correlationAnalyzer = correlationAnalyzer; this.threadStateManager = threadStateManager; this.serializer = serializer; this.scheduler = Executors.newSingleThreadScheduledExecutor( @@ -231,6 +276,34 @@ private void drainAndBroadcast() { } }); } + + // Drain allocation events + if (allocationEventBuffer != null && allocationAnalyzer != null) { + allocationEventBuffer.drain(event -> { + allocationAnalyzer.recordAllocationEvent(event); + }); + } + + // Drain metaspace events + if (metaspaceEventBuffer != null && metaspaceAnalyzer != null) { + metaspaceEventBuffer.drain(event -> { + metaspaceAnalyzer.recordMetaspaceEvent(event); + }); + } + + // Drain execution sample events + if (executionSampleEventBuffer != null && methodProfilingAnalyzer != null) { + executionSampleEventBuffer.drain(event -> { + methodProfilingAnalyzer.recordExecutionSample(event); + }); + } + + // Drain contention events + if (contentionEventBuffer != null && contentionAnalyzer != null) { + contentionEventBuffer.drain(event -> { + contentionAnalyzer.recordContentionEvent(event); + }); + } } /** diff --git a/docs/architecture.md b/docs/architecture.md index 5191611..a9dcc20 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -53,11 +53,18 @@ The core module contains shared components used by other modules. 
**EventType.java** ``` -Enum defining virtual thread event types: +Enum defining event types: ├── VIRTUAL_THREAD_START (1) ├── VIRTUAL_THREAD_END (2) ├── VIRTUAL_THREAD_PINNED (3) -└── VIRTUAL_THREAD_SUBMIT_FAILED (4) +├── VIRTUAL_THREAD_SUBMIT_FAILED (4) +├── GC_PAUSE (10) +├── GC_HEAP_SUMMARY (11) +├── CPU_LOAD (20) +├── ALLOCATION (30) +├── METASPACE_SUMMARY (31) +├── EXECUTION_SAMPLE (40) +└── CONTENTION (41) ``` **VirtualThreadEvent.java** @@ -99,17 +106,36 @@ ArgusAgent.java JfrStreamingEngine.java ├── Uses RecordingStream (JDK 14+) ├── Subscribes to JFR events: -│ ├── jdk.VirtualThreadStart -│ ├── jdk.VirtualThreadEnd -│ ├── jdk.VirtualThreadPinned (with stack trace) -│ └── jdk.VirtualThreadSubmitFailed -├── Converts events to VirtualThreadEvent +│ ├── Virtual Thread Events: +│ │ ├── jdk.VirtualThreadStart +│ │ ├── jdk.VirtualThreadEnd +│ │ ├── jdk.VirtualThreadPinned (with stack trace) +│ │ └── jdk.VirtualThreadSubmitFailed +│ ├── GC & Memory Events: +│ │ ├── jdk.GarbageCollection +│ │ ├── jdk.GCHeapSummary +│ │ ├── jdk.ObjectAllocationInNewTLAB +│ │ └── jdk.MetaspaceSummary +│ └── CPU & Performance Events: +│ ├── jdk.CPULoad +│ ├── jdk.ExecutionSample +│ ├── jdk.JavaMonitorEnter +│ └── jdk.JavaMonitorWait +├── Event Extractors: +│ ├── VirtualThreadEventExtractor +│ ├── GCEventExtractor +│ ├── CPUEventExtractor +│ ├── AllocationEventExtractor +│ ├── MetaspaceEventExtractor +│ ├── ExecutionSampleExtractor +│ └── ContentionEventExtractor +├── Converts events to typed event records └── Offers events to RingBuffer ``` ### argus-server -The server module provides a WebSocket interface for event streaming. +The server module provides a WebSocket interface for event streaming and analysis. 
#### Components @@ -118,9 +144,26 @@ ArgusServer.java ├── Netty-based HTTP/WebSocket server ├── Endpoints: │ ├── ws://host:port/events - WebSocket stream -│ └── GET /health - Health check +│ ├── GET /health - Health check +│ ├── GET /metrics - Thread metrics +│ ├── GET /gc-analysis - GC statistics +│ ├── GET /cpu-metrics - CPU utilization +│ ├── GET /pinning-analysis - Pinning hotspots +│ ├── GET /allocation-analysis - Allocation metrics +│ ├── GET /metaspace-metrics - Metaspace usage +│ ├── GET /method-profiling - Hot methods +│ ├── GET /contention-analysis - Lock contention +│ └── GET /correlation - Correlation & recommendations ├── Event broadcaster (10ms interval) └── JSON serialization + +Analyzers: +├── GCAnalyzer - GC pause, heap, overhead analysis +├── AllocationAnalyzer - Allocation rate, top classes +├── MetaspaceAnalyzer - Metaspace usage tracking +├── MethodProfilingAnalyzer - Hot method detection +├── ContentionAnalyzer - Lock contention hotspots +└── CorrelationAnalyzer - Cross-metric correlation ``` ## Data Flow diff --git a/docs/benchmark-report.md b/docs/benchmark-report.md index 0ae4400..c135060 100644 --- a/docs/benchmark-report.md +++ b/docs/benchmark-report.md @@ -53,7 +53,7 @@ This document presents the performance overhead measurements of Argus Virtual Th **GC Overhead: None observed** -## Summary +## Summary (Basic Monitoring) | Metric | Overhead | |--------|----------| @@ -62,10 +62,63 @@ This document presents the performance overhead measurements of Argus Virtual Th | Latency | No significant impact | | GC | No additional GC pressure | +## Advanced Profiling Features (Phase 3-5) + +The following features are **disabled by default** due to higher overhead: + +### Feature Overhead Comparison + +| Feature | Event Frequency | Overhead Level | Default | +|---------|----------------|----------------|---------| +| GC Monitoring | Low (few per min) | **Very Low** | `true` | +| CPU Monitoring | Low (1/sec) | **Very Low** | `true` | +| Metaspace 
Monitoring | Low (at GC) | **Very Low** | `true` | +| Allocation Tracking | High (millions/sec) | **High** | `false` | +| Method Profiling | Medium (50/sec) | **Medium-High** | `false` | +| Lock Contention | Variable | **Variable** | `false` | + +### Why High-Overhead Features are Disabled + +1. **Allocation Tracking** (`argus.allocation.enabled`) + - JFR events: `jdk.ObjectAllocationInNewTLAB`, `jdk.ObjectAllocationOutsideTLAB` + - Problem: Millions of objects allocated per second → millions of events + - Mitigation: Use threshold ≥ 1MB to track only large allocations + +2. **Method Profiling** (`argus.profiling.enabled`) + - JFR event: `jdk.ExecutionSample` + - Problem: Periodic stack trace capture of all threads at safepoints + - Mitigation: Increase interval (e.g., 50-100ms) + +3. **Lock Contention** (`argus.contention.enabled`) + - JFR events: `jdk.JavaMonitorEnter`, `jdk.JavaMonitorWait` + - Problem: High-concurrency apps may generate many contention events + - Mitigation: Use threshold ≥ 50ms to track only significant contention + +### Recommended Configurations + +**Production (safe defaults):** +```bash +-Dargus.gc.enabled=true +-Dargus.cpu.enabled=true +-Dargus.metaspace.enabled=true +``` + +**Development/Testing (full profiling):** +```bash +-Dargus.allocation.enabled=true +-Dargus.allocation.threshold=1048576 +-Dargus.profiling.enabled=true +-Dargus.profiling.interval=50 +-Dargus.contention.enabled=true +-Dargus.contention.threshold=20 +``` + ## Conclusion Argus introduces approximately **9% throughput overhead** when profiling virtual thread events via JFR streaming. Memory overhead is minimal at **3.6 MB additional heap usage per 10,000 virtual threads**, representing less than 1% of the allocated heap. There is no measurable impact on latency or garbage collection behavior. +**Note:** Advanced profiling features (allocation tracking, method profiling, contention tracking) are disabled by default due to potentially high event volume. 
Enable them only when needed for debugging or optimization, preferably for short durations. + These overhead levels are acceptable for development and staging environments. For production use, consider the throughput trade-off based on your application's performance requirements. ## How to Run diff --git a/docs/configuration.md b/docs/configuration.md index a46154f..15b5b6a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,14 +8,35 @@ The Argus agent is configured via Java system properties. ### Available Properties +#### Core Settings | Property | Default | Description | |----------|---------|-------------| | `argus.server.enabled` | `false` | Enable built-in dashboard server | | `argus.server.port` | `9202` | WebSocket server port | | `argus.buffer.size` | `65536` | Ring buffer size for event collection | + +#### GC & Memory Settings +| Property | Default | Description | +|----------|---------|-------------| | `argus.gc.enabled` | `true` | Enable GC monitoring | +| `argus.allocation.enabled` | `false` | Enable allocation rate tracking (high overhead) | +| `argus.allocation.threshold` | `1048576` | Minimum allocation size to track (1MB) | +| `argus.metaspace.enabled` | `true` | Enable metaspace monitoring | + +#### CPU & Performance Settings +| Property | Default | Description | +|----------|---------|-------------| | `argus.cpu.enabled` | `true` | Enable CPU monitoring | | `argus.cpu.interval` | `1000` | CPU sampling interval (ms) | +| `argus.profiling.enabled` | `false` | Enable method profiling (high overhead) | +| `argus.profiling.interval` | `20` | Method profiling sampling interval (ms) | +| `argus.contention.enabled` | `false` | Enable lock contention tracking | +| `argus.contention.threshold` | `50` | Minimum contention duration to track (ms) | + +#### Analysis Settings +| Property | Default | Description | +|----------|---------|-------------| +| `argus.correlation.enabled` | `true` | Enable correlation analysis | ### Setting Properties 
@@ -111,6 +132,140 @@ java -javaagent:argus-agent.jar \ -Dargus.cpu.interval=2000 ``` +## Allocation Tracking Configuration + +Allocation tracking monitors object allocation rate and identifies top allocating classes. + +```bash +# Configure allocation tracking +java -javaagent:argus-agent.jar \ + -Dargus.allocation.enabled=true \ + -Dargus.allocation.threshold=1024 \ + --enable-preview \ + -jar your-application.jar +``` + +### Allocation Metrics Available + +- Total allocation count +- Total bytes allocated +- Allocation rate (MB/sec) +- Peak allocation rate +- Top 10 allocating classes + +### Tuning Allocation Threshold + +```bash +# Track all allocations >= 512 bytes +-Dargus.allocation.threshold=512 + +# Track only large allocations >= 8KB +-Dargus.allocation.threshold=8192 +``` + +## Metaspace Monitoring Configuration + +Metaspace monitoring tracks class metadata memory usage. + +```bash +# Enable/disable metaspace monitoring +java -javaagent:argus-agent.jar \ + -Dargus.metaspace.enabled=true \ + --enable-preview \ + -jar your-application.jar +``` + +### Metaspace Metrics Available + +- Current used/committed memory +- Peak usage +- Growth rate (MB/min) +- Class count + +## Method Profiling Configuration + +Method profiling identifies CPU-intensive methods using execution sampling. + +**Warning**: Method profiling has high overhead and is disabled by default. Use with caution in production. 
+ +```bash +# Enable method profiling +java -javaagent:argus-agent.jar \ + -Dargus.profiling.enabled=true \ + -Dargus.profiling.interval=20 \ + --enable-preview \ + -jar your-application.jar +``` + +### Method Profiling Metrics Available + +- Total sample count +- Top 20 hot methods +- Method sample percentage + +### Adjusting Profiling Interval + +```bash +# More frequent sampling (higher accuracy, higher overhead) +-Dargus.profiling.interval=10 + +# Less frequent sampling (lower accuracy, lower overhead) +-Dargus.profiling.interval=50 +``` + +## Lock Contention Configuration + +Lock contention tracking monitors thread synchronization bottlenecks. + +```bash +# Configure contention tracking +java -javaagent:argus-agent.jar \ + -Dargus.contention.enabled=true \ + -Dargus.contention.threshold=10 \ + --enable-preview \ + -jar your-application.jar +``` + +### Contention Metrics Available + +- Total contention events +- Total contention time +- Top 10 contention hotspots +- Per-thread contention time + +### Tuning Contention Threshold + +```bash +# Track contention >= 5ms +-Dargus.contention.threshold=5 + +# Track only severe contention >= 50ms +-Dargus.contention.threshold=50 +``` + +## Correlation Analysis Configuration + +Correlation analysis detects relationships between different metrics. 
+ +```bash +# Enable/disable correlation analysis +java -javaagent:argus-agent.jar \ + -Dargus.correlation.enabled=true \ + --enable-preview \ + -jar your-application.jar +``` + +### Correlation Features + +- **GC ↔ CPU Correlation**: Detects CPU spikes within 1 second of GC events +- **GC ↔ Pinning Correlation**: Identifies pinning increases during GC +- **Automatic Recommendations**: Provides actionable insights: + - GC overhead warnings (> 10%) + - Memory leak detection (sustained heap growth) + - Lock contention hotspot alerts + - High allocation rate warnings + - Metaspace growth warnings + ## JFR Event Configuration Argus captures the following JFR events by default: @@ -124,18 +279,23 @@ Argus captures the following JFR events by default: | `jdk.VirtualThreadPinned` | Thread pinning (with stack trace) | Medium | | `jdk.VirtualThreadSubmitFailed` | Submit failures | Low | -### GC Events +### GC & Memory Events | Event | Description | Overhead | |-------|-------------|----------| | `jdk.GarbageCollection` | GC pause events | Low | | `jdk.GCHeapSummary` | Heap usage snapshots | Low | +| `jdk.ObjectAllocationInNewTLAB` | Object allocation in TLAB | Medium | +| `jdk.MetaspaceSummary` | Metaspace usage | Low | -### CPU Events +### CPU & Performance Events | Event | Description | Overhead | |-------|-------------|----------| | `jdk.CPULoad` | CPU utilization (periodic) | Low | +| `jdk.ExecutionSample` | Method execution sampling | Medium-High | +| `jdk.JavaMonitorEnter` | Lock acquisition contention | Low | +| `jdk.JavaMonitorWait` | Lock wait contention | Low | ### JFR Settings diff --git a/samples/virtual-thread-simulation/build.gradle.kts b/samples/virtual-thread-simulation/build.gradle.kts index 2f77af8..c6428bb 100644 --- a/samples/virtual-thread-simulation/build.gradle.kts +++ b/samples/virtual-thread-simulation/build.gradle.kts @@ -77,7 +77,8 @@ tasks.register("runMetricsDemo") { jvmArgs( "--enable-preview", - "-Xmx128m", // Small heap to trigger GC 
frequently + "-Xmx512m", // Enough heap for JFR + Netty + app + "-Xms256m", "-XX:+UseG1GC", "-javaagent:${rootProject.projectDir}/argus-agent/build/libs/argus-agent-${rootProject.property("argusVersion")}.jar", "-Dargus.server.enabled=true", @@ -90,3 +91,44 @@ tasks.register("runMetricsDemo") { jvmArgs("-Dduration=$duration") } } + +// Run metrics demo with ALL features enabled (including high-overhead ones) +tasks.register("runMetricsDemoFull") { + group = "application" + description = "Run metrics demo with ALL profiling features enabled (high overhead)" + + mainClass.set("io.argus.sample.MetricsDemo") + classpath = sourceSets["main"].runtimeClasspath + + javaLauncher.set(javaToolchains.launcherFor { + languageVersion.set(JavaLanguageVersion.of(21)) + }) + + val duration = System.getProperty("duration") + + jvmArgs( + "--enable-preview", + "-Xmx1g", // More heap for full profiling + "-Xms512m", + "-XX:+UseG1GC", + "-javaagent:${rootProject.projectDir}/argus-agent/build/libs/argus-agent-${rootProject.property("argusVersion")}.jar", + "-Dargus.server.enabled=true", + "-Dargus.server.port=9202", + // Core features + "-Dargus.gc.enabled=true", + "-Dargus.cpu.enabled=true", + "-Dargus.metaspace.enabled=true", + // High-overhead features (opt-in) + "-Dargus.allocation.enabled=true", + "-Dargus.allocation.threshold=1048576", // 1MB threshold + "-Dargus.profiling.enabled=true", + "-Dargus.profiling.interval=50", // 50ms interval (lower overhead) + "-Dargus.contention.enabled=true", + "-Dargus.contention.threshold=20", // 20ms threshold + "-Dargus.correlation.enabled=true" + ) + + if (duration != null) { + jvmArgs("-Dduration=$duration") + } +}