refactor: migrate to structured event models and improve syscall parsing

scc-tw · scc-tw · commit 2f27fdd108a0 · 2025-05-02T13:27:25.000+08:00
- Replace raw string event handling with Pydantic models for type safety
- Add support for multiple syscall types (execve, fork, clone, connect)
- Create base event model with common validation
- Move event models to dedicated domain module
- Update tests to use new event models
- Improve trace reader to parse and validate events
- Remove deprecated event_models.py
diff --git a/linux_edr/app.py b/linux_edr/app.py
@@ -2,7 +2,7 @@
 from collections import defaultdict
 import re
 import os
-from typing import Dict, List, Optional, Any, NamedTuple, Iterator, Set, Tuple
+from typing import Dict, List, Optional, Any, NamedTuple, Iterator, Set, Tuple, Union
 from apscheduler.schedulers.background import BackgroundScheduler
 from .trace import TraceReader
 from .aggregator import Aggregator
@@ -11,6 +11,7 @@
 from .config import Config
 from .report_manager import ReportManager
 from .models import Cell
+from .domain.models.events import BaseSyscallEvent, ExecveEvent
 
 
 def setup_logging(debug: bool = False) -> None:
@@ -316,62 +317,35 @@ def _summarize(self) -> None:
 
         logging.info(f"Created cell report {cell.report_id} with {cell.total} events")
 
-    def _process_event(self, evt: str) -> None:
+    def _process_event(self, evt: BaseSyscallEvent) -> None:
         """
         Process a single event from the trace reader.
 
         Args:
             evt: Raw event string from trace_pipe
         """
-        # Log raw event in debug mode
-        if self.debug:
-            # Always log basic event info
-            logging.debug(f"Raw event: {evt}")
-
-            # Log detailed parsed info if verbose debug is enabled
-            if self.verbose_debug:
-                self._log_parsed_event(evt)
-
-        # If event is already a dict (e.g., when injected by tests or future extensions),
-        # we assume it's been validated and directly buffer it.
-        if isinstance(evt, dict):
-            self.agg.add(evt)
-            return
-
-        # Otherwise, treat it as raw text from trace_pipe and try to parse/validate.
-        parsed = parse_execve(evt)
+        if self.verbose_debug:
+            self._log_debug_event(evt)
 
-        if not parsed:
-            # Not an execve line – skip buffering
+        # If the trace reader already produced a validated syscall event model, buffer it directly.
+        if isinstance(evt, BaseSyscallEvent):
+            self.agg.add(evt.model_dump() if hasattr(evt, "model_dump") else evt.dict())
             return
+        else:
+            logging.warning(f"Invalid event type: {type(evt)}")
 
-        # Validate with Pydantic schema (ensures correct types/structure)
-        try:
-            from .domain.models.event_models import ExecveEvent as ExecveEventModel  # Local import to avoid cycles
-
-            model_event = ExecveEventModel.from_namedtuple(parsed)
-
-            # Buffer as plain dict (safer for serialization & downstream processing)
-            self.agg.add(model_event.model_dump())
-        except Exception as e:
-            # Any validation or conversion error – log and drop the event
-            logging.warning("Invalid event skipped: %s", e)
-
-    def _log_parsed_event(self, evt: str) -> None:
+    def _log_debug_event(self, evt: BaseSyscallEvent) -> None:
         """
         Parse and log detailed event information.
 
         Args:
             evt: Raw event string from trace_pipe
         """
+        if not self.debug or not self.verbose_debug:
+            return
+
         try:
-            parsed_evt = parse_execve(evt)
-            if parsed_evt:
-                logging.debug(
-                    f"Parsed execve: timestamp={parsed_evt.timestamp}, "
-                    f"pid={parsed_evt.pid}, command={parsed_evt.command}, "
-                    f"args={parsed_evt.args}"
-                )
+            logging.debug(f"{evt}")
         except Exception as e:
             logging.debug(f"Parse error: {str(e)}")
 
diff --git a/linux_edr/domain/models/event_models.py b/linux_edr/domain/models/event_models.py
diff --git a/linux_edr/domain/models/events/__init__.py b/linux_edr/domain/models/events/__init__.py
@@ -0,0 +1,13 @@
+from .base import BaseSyscallEvent
+from .execve import ExecveEvent
+from .fork import ForkEvent
+from .clone import CloneEvent
+from .connect import ConnectEvent
+
+__all__ = [
+    "BaseSyscallEvent",
+    "ExecveEvent",
+    "ForkEvent",
+    "CloneEvent",
+    "ConnectEvent",
+] 
diff --git a/linux_edr/domain/models/events/base.py b/linux_edr/domain/models/events/base.py
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field, field_validator
+from datetime import datetime
+
+class BaseSyscallEvent(BaseModel):
+    """Common attributes for all syscall events."""
+
+    timestamp: str = Field(..., description="Kernel timestamp (can be converted to datetime later)")
+    pid: int = Field(..., ge=0, description="Process ID that triggered the syscall")
+
+    # --- validators -------------------------------------------------------
+    @field_validator("timestamp")
+    @classmethod
+    def _validate_iso_or_numeric(cls, v: str) -> str:  # pragma: no cover
+        """Accept any non-empty timestamp string (ISO-format or numeric); the format itself is not checked."""
+        if not v:
+            raise ValueError("timestamp cannot be empty")
+        return v
+
+    @field_validator("pid")
+    @classmethod
+    def _validate_pid(cls, v: int) -> int:  # pragma: no cover
+        if v < 0:
+            raise ValueError("pid must be non-negative")
+        return v
+
+    # --- helpers ----------------------------------------------------------
+    def __str__(self) -> str:  # pragma: no cover – convenience only
+        return f"[{self.timestamp}] pid={self.pid}" 
diff --git a/linux_edr/domain/models/events/clone.py b/linux_edr/domain/models/events/clone.py
@@ -0,0 +1,11 @@
+from pydantic import Field
+from .base import BaseSyscallEvent
+
+class CloneEvent(BaseSyscallEvent):
+    """clone syscall event."""
+
+    child_pid: int = Field(..., ge=0, description="PID of the cloned task")
+    flags: str = Field(..., description="Clone flags")
+
+    def __str__(self) -> str:  # pragma: no cover
+        return f"{super().__str__()} clone -> child_pid={self.child_pid} flags={self.flags}" 
diff --git a/linux_edr/domain/models/events/connect.py b/linux_edr/domain/models/events/connect.py
@@ -0,0 +1,11 @@
+from pydantic import Field
+from .base import BaseSyscallEvent
+
+class ConnectEvent(BaseSyscallEvent):
+    """connect syscall event."""
+
+    fd: int = Field(..., ge=0, description="Socket file descriptor")
+    address: str = Field(..., description="Destination address (ip:port or path)")
+
+    def __str__(self) -> str:  # pragma: no cover
+        return f"{super().__str__()} connect -> fd={self.fd} addr={self.address}" 
diff --git a/linux_edr/domain/models/events/execve.py b/linux_edr/domain/models/events/execve.py
@@ -0,0 +1,14 @@
+from typing import List
+from pydantic import Field
+
+from .base import BaseSyscallEvent
+
+class ExecveEvent(BaseSyscallEvent):
+    """execve syscall event."""
+
+    command: str = Field(..., description="Executable invoked (basename)")
+    args: List[str] = Field(default_factory=list, description="Arguments supplied to the executable")
+
+    def __str__(self) -> str:  # pragma: no cover
+        cmd_line = " ".join([self.command, *self.args])
+        return f"{super().__str__()} execve -> {cmd_line}" 
diff --git a/linux_edr/domain/models/events/fork.py b/linux_edr/domain/models/events/fork.py
@@ -0,0 +1,10 @@
+from pydantic import Field
+from .base import BaseSyscallEvent
+
+class ForkEvent(BaseSyscallEvent):
+    """fork syscall event."""
+
+    child_pid: int = Field(..., ge=0, description="Child process PID created by fork")
+
+    def __str__(self) -> str:  # pragma: no cover
+        return f"{super().__str__()} fork -> child_pid={self.child_pid}" 
diff --git a/linux_edr/trace.py b/linux_edr/trace.py
@@ -4,15 +4,23 @@
 import errno
 import logging
 import time
-from typing import Generator, Optional
+import re
+from typing import Generator, Optional, Union
 
 # Default path to the kernel's trace_pipe
 TRACE_PATH = "/sys/kernel/tracing/trace_pipe"
 # Maximum time to wait when reading (in seconds)
-DEFAULT_TIMEOUT = 10
+DEFAULT_TIMEOUT = 1.0
 
 logger = logging.getLogger(__name__)
 
+# Pre-compiled regexes for supported syscalls
+EXECVE_PATTERN = re.compile(r"(\S+)\s+\[(\d+)\]\s+.*execve.*\((.*?)\)")
+FORK_PATTERN = re.compile(r"(\S+)\s+\[(\d+)\]\s+.*fork.*child_pid=(\d+)")
+CLONE_PATTERN = re.compile(r"(\S+)\s+\[(\d+)\]\s+.*clone.*child_pid=(\d+)\s+flags=(\S+)")
+CONNECT_PATTERN = re.compile(r"(\S+)\s+\[(\d+)\]\s+.*connect.*fd=(\d+)\s+addr=(.+)")
+
+from .domain.models.events import ExecveEvent, ForkEvent, CloneEvent, ConnectEvent, BaseSyscallEvent
 
 class TraceReader:
     """
@@ -121,7 +129,34 @@ def _reopen_if_needed(self) -> bool:
                 return False
         return True
 
-    def __iter__(self) -> Generator[str, None, None]:
+    def _parse_line(self, line: str) -> Optional[BaseSyscallEvent]:
+        """Attempt to parse a supported syscall line into a Pydantic model."""
+        # execve
+        if m := EXECVE_PATTERN.search(line):
+            ts, pid_str, cmd_args = m.groups()
+            pid = int(pid_str)
+            parts = cmd_args.split() if cmd_args else []
+            if parts:
+                return ExecveEvent(timestamp=ts, pid=pid, command=parts[0].strip('"'), args=parts[1:])
+
+        # fork
+        if m := FORK_PATTERN.search(line):
+            ts, pid_str, child_pid_str = m.groups()
+            return ForkEvent(timestamp=ts, pid=int(pid_str), child_pid=int(child_pid_str))
+
+        # clone
+        if m := CLONE_PATTERN.search(line):
+            ts, pid_str, child_pid_str, flags = m.groups()
+            return CloneEvent(timestamp=ts, pid=int(pid_str), child_pid=int(child_pid_str), flags=flags)
+
+        # connect
+        if m := CONNECT_PATTERN.search(line):
+            ts, pid_str, fd_str, addr = m.groups()
+            return ConnectEvent(timestamp=ts, pid=int(pid_str), fd=int(fd_str), address=addr)
+
+        return None
+
+    def __iter__(self) -> Generator[Union[str, BaseSyscallEvent], None, None]:
         """
         Iterate over lines from the trace pipe.
 
@@ -168,8 +203,12 @@ def __iter__(self) -> Generator[str, None, None]:
                                 text = data.decode("utf-8", errors="replace")
 
                             for line in text.splitlines():
-                                if line.strip():  # Skip empty lines
-                                    yield line
+                                if not line.strip():
+                                    continue
+
+                                parsed_evt = self._parse_line(line)
+                                # Yield the parsed object if recognized, else the raw line for backward-compat.
+                                yield parsed_evt if parsed_evt else line
                         except OSError as e:
                             if e.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                                 continue
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -11,7 +11,7 @@
     SyscallTracer,
 )
 
-from linux_edr.domain.models.event_models import ExecveEvent
+from linux_edr.domain.models.events import ExecveEvent
 
 
 class TestApp(unittest.TestCase):
@@ -266,30 +266,25 @@ def test_process_event(self, mock_parse_execve, mock_log_info, mock_log_debug):
         # Import the method to test it independently
         from linux_edr.app import LinuxEDRApp
 
-        # Call the method directly
-        LinuxEDRApp._process_event(app, "test_event")
+        # Call the method directly with a validated event
+        LinuxEDRApp._process_event(app, parsed_event)
 
         # The aggregator should receive a validated dict version of the parsed event
-        expected_dict = {
-            "timestamp": "12345.6789",
-            "pid": 1000,
-            "command": "test_cmd",
-            "args": ["-a", "-b"],
-        }
+        expected_dict = parsed_event.model_dump()
         app.agg.add.assert_called_once_with(expected_dict)
 
         # Reset mock and test with verbose_debug=False
         mock_log_debug.reset_mock()
         app.verbose_debug = False
 
-        LinuxEDRApp._process_event(app, "test_event2")
+        LinuxEDRApp._process_event(app, parsed_event)
         app.agg.add.assert_called_with(expected_dict)
 
         # Test with debug=False
         mock_log_debug.reset_mock()
         app.debug = False
 
-        LinuxEDRApp._process_event(app, "test_event3")
+        LinuxEDRApp._process_event(app, parsed_event)
         app.agg.add.assert_called_with(expected_dict)
         mock_log_debug.assert_not_called()
 
diff --git a/tests/test_run.py b/tests/test_run.py
@@ -2,6 +2,7 @@
 from unittest.mock import patch, MagicMock, call
 import time
 from linux_edr.app import LinuxEDRApp
+from linux_edr.domain.models.events import ExecveEvent
 
 
 class TestAppRun(unittest.TestCase):
@@ -31,8 +32,8 @@ def test_run_normal_operation(
         mock_agg_instance = mock_aggregator.return_value
 
         # Prepare trace reader to return two events then raise KeyboardInterrupt
-        event1 = {"command": "ls", "args": ["-la"], "pid": 1000}
-        event2 = {"command": "cat", "args": ["/etc/passwd"], "pid": 1001}
+        event1 = ExecveEvent(timestamp="t1", pid=1000, command="ls", args=["-la"])
+        event2 = ExecveEvent(timestamp="t2", pid=1001, command="cat", args=["/etc/passwd"])
 
         # Set up the iterable to yield two events then stop
         mock_reader_instance.__iter__.return_value = iter([event1, event2, None])
@@ -47,7 +48,7 @@ def test_run_normal_operation(
         mock_scheduler_instance.start.assert_called_once()
 
         # Verify events were added to aggregator
-        mock_agg_instance.add.assert_has_calls([call(event1), call(event2)])
+        mock_agg_instance.add.assert_has_calls([call(event1.model_dump()), call(event2.model_dump())])
 
         # Verify logging
         mock_logging.info.assert_any_call("Scheduler started")
@@ -80,7 +81,7 @@ def test_run_keyboard_interrupt(
 
         # Make reader iterator raise KeyboardInterrupt
         def iter_side_effect():
-            yield {"command": "ls", "args": ["-la"], "pid": 1000}
+            yield ExecveEvent(timestamp="t1", pid=1000, command="ls", args=["-la"])
             raise KeyboardInterrupt()
 
         mock_reader_instance.__iter__.return_value = iter_side_effect()