diff --git a/config/pipeline_config_default.yaml b/config/pipeline_config_default.yaml index 84a53df..4689bc8 100644 --- a/config/pipeline_config_default.yaml +++ b/config/pipeline_config_default.yaml @@ -82,3 +82,20 @@ detectors: method_type: new_event_detector auto_config: False params: {} + + CharsetDetector: + method_type: charset_detector + auto_config: False + params: {} + events: + 1: + test: + params: {} + variables: + - pos: 0 + name: var1 + params: + threshold: 0. + header_variables: + - pos: level + params: {} diff --git a/docs/detectors.md b/docs/detectors.md index 668b527..88f395f 100644 --- a/docs/detectors.md +++ b/docs/detectors.md @@ -89,6 +89,7 @@ List of detectors: * [Combo Detector](detectors/combo.md): Detect new combination of variables in the logs. * [New Event](detectors/new_event.md): Detect new events in the variables in the logs. * [Rule Based](detectors/rule_based.md): Detect anomalies based in a set of rules. +* [Charset](detectors/charset.md): Detect new characters in the variables in the logs. ## Configuration diff --git a/docs/detectors/charset.md b/docs/detectors/charset.md new file mode 100644 index 0000000..99f7eb6 --- /dev/null +++ b/docs/detectors/charset.md @@ -0,0 +1,62 @@ +# Charset Detector + +The Charset Detector raises alerts when previously unseen characters appear in configured fields. It is useful to detect novelty, configuration drift, or the appearance of new actors in the environment. + +| | Schema | Description | +|------------|----------------------------|--------------------| +| **Input** | [ParserSchema](../schemas.md) | Structured log | +| **Output** | [DetectorSchema](../schemas.md) | Alert / finding | + +## Description + +This detector maintains a lightweight set of observed characters per monitored field and emits an alert when a character not present in the set is seen for the first time (subject to configuration).
+ + +## Configuration example + +```yaml +detectors: + CharsetDetector: + method_type: charset_detector + auto_config: False + params: {} + events: + 1: + test: + params: {} + variables: + - pos: 0 + name: var1 + params: + threshold: 0. + header_variables: + - pos: level + params: {} +``` + + +## Example usage + +```python +from detectmatelibrary.detectors.charset_detector import CharsetDetector +import detectmatelibrary.schemas as schemas + +detector = CharsetDetector(name="CharsetTest", config=cfg) + +parsed_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": 1, + "template": "test template", + "variables": ["var1"], + "logID": "1", + "parsedLogID": "1", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"timestamp": "123456"} +}) + + +alert = detector.process(parsed_data) +``` + +Go back [Index](../index.md) diff --git a/src/detectmatelibrary/detectors/__init__.py b/src/detectmatelibrary/detectors/__init__.py index c10328e..1bba37a 100644 --- a/src/detectmatelibrary/detectors/__init__.py +++ b/src/detectmatelibrary/detectors/__init__.py @@ -1,6 +1,7 @@ from .random_detector import RandomDetector, RandomDetectorConfig from .new_value_detector import NewValueDetector, NewValueDetectorConfig from .new_event_detector import NewEventDetector, NewEventDetectorConfig +from .charset_detector import CharsetDetector, CharsetDetectorConfig __all__ = [ "random_detector", @@ -9,5 +10,7 @@ "NewValueDetectorConfig", "RandomDetector", "NewEventDetector", - "NewEventDetectorConfig" + "NewEventDetectorConfig", + "CharsetDetector", + "CharsetDetectorConfig" ] diff --git a/src/detectmatelibrary/detectors/charset_detector.py b/src/detectmatelibrary/detectors/charset_detector.py new file mode 100644 index 0000000..91e128b --- /dev/null +++ b/src/detectmatelibrary/detectors/charset_detector.py @@ -0,0 +1,158 @@ +from detectmatelibrary.common._config._compile import generate_detector_config +from detectmatelibrary.common._config._formats 
import EventsConfig +from detectmatelibrary.common.detector import ( + CoreDetectorConfig, + CoreDetector, + get_configured_variables, + get_global_variables, + validate_config_coverage, +) +from detectmatelibrary.utils.persistency.event_data_structures.trackers.stability.stability_tracker import ( + EventStabilityTracker +) +from detectmatelibrary.utils.persistency.event_persistency import EventPersistency +from detectmatelibrary.utils.data_buffer import BufferMode +from detectmatelibrary.schemas import ParserSchema, DetectorSchema +from detectmatelibrary.constants import GLOBAL_EVENT_ID +from typing_extensions import override +from tools.logging import logger + + +class CharsetDetectorConfig(CoreDetectorConfig): + method_type: str = "charset_detector" + + use_stable_vars: bool = True + use_static_vars: bool = True + + +class CharsetDetector(CoreDetector): + """Detect characters not seen in training as anomalies in log variables.""" + + def __init__( + self, + name: str = "CharsetDetector", + config: CharsetDetectorConfig = CharsetDetectorConfig() + ) -> None: + + if isinstance(config, dict): + config = CharsetDetectorConfig.from_dict(config, name) + + super().__init__(name=name, buffer_mode=BufferMode.NO_BUF, config=config) + self.config: CharsetDetectorConfig # type narrowing for IDE + self.persistency = EventPersistency( + event_data_class=EventStabilityTracker, + event_data_kwargs={"expand_value": True}, + ) + # auto config checks which individual variables are stable/static to select the variables to monitor + self.auto_conf_persistency = EventPersistency(event_data_class=EventStabilityTracker) + self._register_persistency(self.persistency) + + def train(self, input_: ParserSchema) -> None: # type: ignore + """Train the detector by learning characters from the input data.""" + configured_variables = get_configured_variables(input_, self.config.events) + self.persistency.ingest_event( + event_id=input_["EventID"], + event_template=input_["template"],
named_variables=configured_variables + ) + if self.config.global_instances: + global_vars = get_global_variables(input_, self.config.global_instances) + if global_vars: + self.persistency.ingest_event( + event_id=GLOBAL_EVENT_ID, + event_template=input_["template"], + named_variables=global_vars + ) + + def detect( + self, input_: ParserSchema, output_: DetectorSchema # type: ignore + ) -> bool: + """Detect characters in the input data that were not seen in + training.""" + alerts: dict[str, str] = {} + configured_variables = get_configured_variables(input_, self.config.events) + overall_score = 0.0 + + current_event_id = input_["EventID"] + known_events = self.persistency.get_events_data() + + if current_event_id in known_events: + event_tracker = known_events[current_event_id] + for var_name, single_tracker in event_tracker.get_data().items(): + v = configured_variables.get(var_name) + if v is None: + continue + unknown = set(v) - single_tracker.unique_set + if unknown: + alerts[f"EventID {current_event_id} - {var_name}"] = ( + "Unknown character(s): " + + ", ".join(f"'{c}'" for c in sorted(unknown)) + ) + overall_score += 1.0 + + if self.config.global_instances and GLOBAL_EVENT_ID in known_events: + global_vars = get_global_variables(input_, self.config.global_instances) + global_tracker = known_events[GLOBAL_EVENT_ID] + for var_name, single_tracker in global_tracker.get_data().items(): + v = global_vars.get(var_name) + if v is None: + continue + unknown = set(v) - single_tracker.unique_set + if unknown: + alerts[f"Global - {var_name}"] = ( + "Unknown character(s): " + + ", ".join(f"'{c}'" for c in sorted(unknown)) + ) + overall_score += 1.0 + + if overall_score > 0: + output_["score"] = overall_score + output_["description"] = ( + f"{self.name} detects characters not encountered in training as anomalies." 
+ ) + output_["alertsObtain"].update(alerts) + return True + + return False + + def configure(self, input_: ParserSchema) -> None: # type: ignore + self.auto_conf_persistency.ingest_event( + event_id=input_["EventID"], + event_template=input_["template"], + variables=input_["variables"], + named_variables=input_["logFormatVariables"], + ) + + @override + def post_train(self) -> None: + if not self.config.auto_config: + validate_config_coverage(self.name, self.config.events, self.persistency) + + def set_configuration(self) -> None: + variables = {} + for event_id, tracker in self.auto_conf_persistency.get_events_data().items(): + stable = [] + if self.config.use_stable_vars: + stable = tracker.get_features_by_classification("STABLE") # type: ignore + static = [] + if self.config.use_static_vars: + static = tracker.get_features_by_classification("STATIC") # type: ignore + vars_ = stable + static + if len(vars_) > 0: + variables[event_id] = vars_ + old_persist = self.config.persist + config_dict = generate_detector_config( + variable_selection=variables, + detector_name=self.name, + method_type=self.config.method_type, + ) + # Update the config object from the dictionary instead of replacing it + self.config = CharsetDetectorConfig.from_dict(config_dict, self.name) + self.config.persist = old_persist + events = self.config.events + if isinstance(events, EventsConfig) and not events.events: + logger.warning( + f"[{self.name}] auto_config=True generated an empty configuration. " + "No stable variables were found in configure-phase data. " + "The detector will produce no alerts." 
+ ) diff --git a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/event_tracker.py b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/event_tracker.py index a267062..f43a22a 100644 --- a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/event_tracker.py +++ b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/event_tracker.py @@ -18,7 +18,7 @@ class EventTracker(EventDataStructure): def __init__( self, - single_tracker_type: Type[SingleTracker] = SingleTracker, + single_tracker_type: Callable[[], SingleTracker] = SingleTracker, multi_tracker_type: Type[MultiTracker] = MultiTracker, converter_function: Callable[[Any], Any] = lambda x: x, ) -> None: @@ -66,6 +66,17 @@ def dump(self) -> bytes: def load(cls, data: bytes, **kwargs: Any) -> "EventTracker": """Restore tracker state from MessagePack bytes. + Reconstruction follows two paths. When ``cls is EventTracker`` the + legacy path runs: ``__new__`` allocates the instance and the base + ``__init__`` is invoked directly with the ``single_tracker_type`` and + ``multi_tracker_type`` recorded in the snapshot. For any subclass, + ``cls(**kwargs)`` is called instead, which lets subclasses with + closure-based factories (e.g. ``EventStabilityTracker``'s + ``expand_value``) rebuild their factory so it survives load. + + Contract for subclasses: ``__init__`` must accept the kwargs forwarded + to ``load()`` and must not require additional positional arguments. + Note: event_id and template (base dataclass fields) are not restored; they remain at defaults (-1 and "") as they are managed by EventPersistency. 
""" @@ -89,12 +100,15 @@ def _list_keys_to_tuples(pairs: list[tuple[Any, Any]]) -> dict[Any, Any]: importlib.import_module(state["multi_tracker_module"]), state["multi_tracker_type"], ) - instance = cls.__new__(cls) - EventTracker.__init__( - instance, - single_tracker_type=single_tracker_cls, - multi_tracker_type=multi_tracker_cls, - ) + if cls is EventTracker: + instance = cls.__new__(cls) + EventTracker.__init__( + instance, + single_tracker_type=single_tracker_cls, + multi_tracker_type=multi_tracker_cls, + ) + else: + instance = cls(**kwargs) for name, tracker_state in state["trackers"].items(): instance.multi_tracker.single_trackers[name] = single_tracker_cls.from_state(tracker_state) return instance diff --git a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/multi_tracker.py b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/multi_tracker.py index d6fea78..08df821 100644 --- a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/multi_tracker.py +++ b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/base/multi_tracker.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Type +from typing import Any, Callable, Dict from detectmatelibrary.utils.preview_helpers import format_dict_repr @@ -9,9 +9,9 @@ class MultiTracker: """Tracks multiple features (e.g. 
variables or variable combos) using individual trackers.""" - def __init__(self, single_tracker_type: Type[SingleTracker] = SingleTracker) -> None: + def __init__(self, single_tracker_type: Callable[[], SingleTracker] = SingleTracker) -> None: self.single_trackers: Dict[str, SingleTracker] = {} - self.single_tracker_type: Type[SingleTracker] = single_tracker_type + self.single_tracker_type: Callable[[], SingleTracker] = single_tracker_type def add_data(self, data_object: Dict[str, Any]) -> None: """Add data to the appropriate feature trackers.""" diff --git a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/stability/stability_tracker.py b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/stability/stability_tracker.py index d4b1235..531f01c 100644 --- a/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/stability/stability_tracker.py +++ b/src/detectmatelibrary/utils/persistency/event_data_structures/trackers/stability/stability_tracker.py @@ -12,20 +12,21 @@ class SingleStabilityTracker(SingleTracker): """Tracks stability of a single feature.""" - def __init__(self, min_samples: int = 3) -> None: + def __init__(self, min_samples: int = 3, expand_value: bool = False) -> None: self.min_samples = min_samples + self.expand_value = expand_value self.change_series: RLEList[bool] = RLEList() self.unique_set: Set[Any] = set() self.stability_classifier: StabilityClassifier = StabilityClassifier( segment_thresholds=[1.1, 0.3, 0.1, 0.01], ) + self._accum = set.update if expand_value else set.add def add_value(self, value: Any) -> None: """Add a new value to the tracker.""" - unique_set_size_before = len(self.unique_set) - self.unique_set.add(value) - has_changed = len(self.unique_set) - unique_set_size_before > 0 - self.change_series.append(has_changed) + before = len(self.unique_set) + self._accum(self.unique_set, value) + self.change_series.append(len(self.unique_set) > before) def classify(self) -> 
Classification: """Classify the variable.""" @@ -65,6 +66,7 @@ def to_state(self) -> Dict[str, Any]: "type": self.__class__.__name__, "module": self.__class__.__module__, "min_samples": self.min_samples, + "expand_value": self.expand_value, "runs": self.change_series.runs(), "unique_set": list(self.unique_set), "segment_thresholds": self.stability_classifier.segment_threshs, @@ -73,7 +75,10 @@ def to_state(self) -> Dict[str, Any]: @classmethod def from_state(cls, state: Dict[str, Any]) -> "SingleStabilityTracker": """Restore tracker from a state dict produced by to_state().""" - tracker = cls(min_samples=state["min_samples"]) + tracker = cls( + min_samples=state["min_samples"], + expand_value=state.get("expand_value", False), + ) runs = [(bool(r[0]), int(r[1])) for r in state["runs"]] tracker.change_series._runs = runs tracker.change_series._len = sum(count for _, count in runs) @@ -118,10 +123,23 @@ class EventStabilityTracker(EventTracker): """Event data structure that tracks the stability of each event over time / number of events.""" - def __init__(self, converter_function: Callable[[Any], Any] = lambda x: x) -> None: - self.multi_tracker: MultiStabilityTracker # for type hinting + def __init__( + self, + converter_function: Callable[[Any], Any] = lambda x: x, + expand_value: bool = False, + ) -> None: + self.multi_tracker: MultiStabilityTracker # for type hinting + + def make_tracker() -> SingleStabilityTracker: + return SingleStabilityTracker(expand_value=expand_value) + + # Mirror class identity onto the closure so dump()/load() can resolve + # the underlying SingleStabilityTracker via its module + qualname. 
+ make_tracker.__name__ = SingleStabilityTracker.__name__ + make_tracker.__module__ = SingleStabilityTracker.__module__ + super().__init__( - single_tracker_type=SingleStabilityTracker, + single_tracker_type=make_tracker, multi_tracker_type=MultiStabilityTracker, converter_function=converter_function, ) diff --git a/tests/test_detectors/test_charset_detector.py b/tests/test_detectors/test_charset_detector.py new file mode 100644 index 0000000..106cf5e --- /dev/null +++ b/tests/test_detectors/test_charset_detector.py @@ -0,0 +1,422 @@ +"""Tests for CharsetDetector class. + +This module tests the CharsetDetector implementation including: +- Initialization and configuration +- Training functionality to learn known characters +- Detection logic for new/unknown characters +- Event-specific configuration handling +- Input/output schema validation +""" + +from detectmatelibrary.common._core_op._fit_logic import TrainState +from detectmatelibrary.detectors.charset_detector import CharsetDetector, CharsetDetectorConfig, BufferMode +from detectmatelibrary.common._core_op._fit_logic import ConfigState +from detectmatelibrary.constants import GLOBAL_EVENT_ID +from detectmatelibrary.parsers.template_matcher import MatcherParser +from detectmatelibrary.helper.from_to import From +import detectmatelibrary.schemas as schemas + +from detectmatelibrary.utils.aux import time_test_mode + +# Set time test mode for consistent timestamps +time_test_mode() + + +config = { + "detectors": { + "CustomInit": { + "method_type": "charset_detector", + "auto_config": False, + "params": {}, + "events": { + 1: { + "instance1": { + "params": {}, + "variables": [{ + "pos": 0, "name": "sad", "params": {} + }] + } + } + } + }, + "MultipleDetector": { + "method_type": "charset_detector", + "auto_config": False, + "params": {}, + "events": { + 1: { + "test": { + "params": {}, + "variables": [{ + "pos": 1, "name": "test", "params": {} + }], + "header_variables": [{ + "pos": "level", "params": {} + }] + } 
+ } + } + } + } +} + + +class TestCharsetDetectorInitialization: + """Test CharsetDetector initialization and configuration.""" + + def test_default_initialization(self): + """Test detector initialization with default parameters.""" + detector = CharsetDetector() + + assert detector.name == "CharsetDetector" + assert hasattr(detector, 'config') + assert detector.data_buffer.mode == BufferMode.NO_BUF + assert detector.input_schema == schemas.ParserSchema + assert detector.output_schema == schemas.DetectorSchema + assert hasattr(detector, 'persistency') + + def test_custom_config_initialization(self): + """Test detector initialization with custom configuration.""" + detector = CharsetDetector(name="CustomInit", config=config) + + assert detector.name == "CustomInit" + assert hasattr(detector, 'persistency') + assert isinstance(detector.persistency.events_data, dict) + + def test_persistency_uses_expand_value(self): + """Main persistency must accumulate characters; auto_conf must not.""" + detector = CharsetDetector() + # Ingest a sample so a SingleStabilityTracker is materialized + detector.persistency.ingest_event( + event_id=1, + event_template="t", + named_variables={"v": "hello"}, + ) + single = detector.persistency.get_event_data(1)["v"] + assert single.expand_value is True + assert single.unique_set == {"h", "e", "l", "o"} + + def test_auto_conf_persistency_does_not_expand(self): + detector = CharsetDetector() + detector.auto_conf_persistency.ingest_event( + event_id=1, + event_template="t", + named_variables={"v": "hello"}, + ) + single = detector.auto_conf_persistency.get_event_data(1)["v"] + assert single.expand_value is False + assert single.unique_set == {"hello"} + + def test_register_persistency_was_called(self): + """Main persistency should be registered so persist/load round-trips + work.""" + from detectmatelibrary.common.detector import PersistConfig + cfg = CharsetDetectorConfig( + persist=PersistConfig(path="memory://charset_regpersist/state") + ) 
+ detector = CharsetDetector(config=cfg) + # _register_persistency builds a PersistencySaver bound to detector.persistency + assert detector.saver is not None + assert detector.saver._persistency is detector.persistency + detector.saver.stop() + + +class TestCharsetDetectorTraining: + """Test CharsetDetector training functionality.""" + + def test_train_multiple_values(self): + """Test training with multiple different values.""" + detector = CharsetDetector(config=config, name="MultipleDetector") + # Train with multiple values (only event 1 should be tracked per config) + for event in range(3): + for level in ["INFO", "WARNING", "ERROR"]: + parser_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": event, + "template": "test template", + "variables": ["0", "assa"], + "logID": "1", + "parsedLogID": "1", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"level": level} + }) + detector.train(parser_data) + + # Only event 1 should be tracked (based on events config) + assert len(detector.persistency.events_data) == 1 + event_data = detector.persistency.get_event_data(1) + assert event_data is not None + # With expand_value=True, unique_set contains individual characters + assert set("INFO") <= event_data["level"].unique_set + assert set("WARNING") <= event_data["level"].unique_set + assert set("ERROR") <= event_data["level"].unique_set + assert set("assa") <= event_data["test"].unique_set + + +class TestCharsetDetectorDetection: + """Test CharsetDetector detection functionality.""" + + def test_detect_known_value_no_alert(self): + detector = CharsetDetector(config=config, name="MultipleDetector") + + # Train with a value + train_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": 1, + "template": "test template", + "variables": ["adsasd", "asdasd"], + "logID": "1", + "parsedLogID": "1", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"level": "INFO"} + }) + 
detector.train(train_data) + + # Detect on an event ID (12) that was never trained, so no alert is expected + test_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": 12, + "template": "test template", + "variables": ["adsasddddddaaa"], + "logID": "2", + "parsedLogID": "2", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"level": "CRITICAL"} + }) + output = schemas.DetectorSchema() + + result = detector.detect(test_data, output) + + assert not result + assert output.score == 0.0 + + def test_detect_known_value_alert(self): + detector = CharsetDetector(config=config, name="MultipleDetector") + + # Train with a value + train_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": 1, + "template": "test template", + "variables": ["adsasd", "asdasd"], + "logID": "1", + "parsedLogID": "1", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"level": "INFO"} + }) + detector.train(train_data) + + # Detect on the trained event: variable chars are known, but the level "CRITICAL" has chars not seen in training + test_data = schemas.ParserSchema({ + "parserType": "test", + "EventID": 1, + "template": "test template", + "variables": ["asas", "adsd"], + "logID": "2", + "parsedLogID": "2", + "parserID": "test_parser", + "log": "test log message", + "logFormatVariables": {"level": "CRITICAL"} + }) + output = schemas.DetectorSchema() + + result = detector.detect(test_data, output) + + assert result + assert output.score == 1.0 + + def test_detect_unknown_chars_reported_per_variable(self): + """Train on a known alphabet; detect a value with unknown chars and + confirm the alert string lists the unknown chars sorted.""" + cfg = { + "detectors": { + "Single": { + "method_type": "charset_detector", + "auto_config": False, + "params": {}, + "events": { + 1: { + "test": { + "params": {}, + "variables": [{"pos": 0, "name": "v", "params": {}}], + } + } + }, + } + } + } + detector = CharsetDetector(config=cfg, name="Single") + + train = schemas.ParserSchema({ + "parserType": "test", "EventID": 1, "template": "t", + "variables":
["abc"], "logID": "1", "parsedLogID": "1", + "parserID": "p", "log": "l", "logFormatVariables": {}, + }) + detector.train(train) + + # All known chars + ok = schemas.ParserSchema({ + "parserType": "test", "EventID": 1, "template": "t", + "variables": ["cba"], "logID": "2", "parsedLogID": "2", + "parserID": "p", "log": "l", "logFormatVariables": {}, + }) + out = schemas.DetectorSchema() + assert not detector.detect(ok, out) + assert out.score == 0.0 + + # Unknown chars 'x' and 'y' + bad = schemas.ParserSchema({ + "parserType": "test", "EventID": 1, "template": "t", + "variables": ["axy"], "logID": "3", "parsedLogID": "3", + "parserID": "p", "log": "l", "logFormatVariables": {}, + }) + out = schemas.DetectorSchema() + assert detector.detect(bad, out) + assert out.score == 1.0 + assert any("'x'" in msg and "'y'" in msg for msg in out["alertsObtain"].values()) + + +_PARSER_CONFIG = { + "parsers": { + "MatcherParser": { + "method_type": "matcher_parser", + "auto_config": False, + "log_format": "type= msg=audit(