Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions config/pipeline_config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,20 @@ detectors:
method_type: new_event_detector
auto_config: False
params: {}

CharsetDetector:
method_type: charset_detector
auto_config: False
params: {}
events:
1:
test:
params: {}
variables:
- pos: 0
name: var1
params:
threshold: 0.
header_variables:
- pos: level
params: {}
1 change: 1 addition & 0 deletions docs/detectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ List of detectors:
* [Combo Detector](detectors/combo.md): Detect new combinations of variables in the logs.
* [New Event](detectors/new_event.md): Detect new events in the variables in the logs.
* [Rule Based](detectors/rule_based.md): Detect anomalies based on a set of rules.
* [Charset](detectors/charset.md): Detect new characters in the variables in the logs.

## Configuration

Expand Down
62 changes: 62 additions & 0 deletions docs/detectors/charset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Charset Detector

The Charset Detector raises alerts when previously unseen characters appear in configured fields. It is useful to detect novelty, configuration drift, or the appearance of new actors in the environment.

| | Schema | Description |
|------------|----------------------------|--------------------|
| **Input** | [ParserSchema](../schemas.md) | Structured log |
| **Output** | [DetectorSchema](../schemas.md) | Alert / finding |

## Description

This detector maintains a lightweight set of observed characters per monitored field and emits an alert when a character not present in the set is seen for the first time (subject to configuration).


## Configuration example

```yaml
detectors:
CharsetDetector:
method_type: charset_detector
auto_config: False
params: {}
events:
1:
test:
params: {}
variables:
- pos: 0
name: var1
params:
threshold: 0.
header_variables:
- pos: level
params: {}
```


## Example usage

```python
from detectmatelibrary.detectors.charset_detector import CharsetDetector
import detectmatelibrary.schemas as schemas

detector = CharsetDetector(name="CharsetTest", config=cfg)

parsed_data = schemas.ParserSchema({
"parserType": "test",
"EventID": 1,
"template": "test template",
"variables": ["var1"],
"logID": "1",
"parsedLogID": "1",
"parserID": "test_parser",
"log": "test log message",
"logFormatVariables": {"timestamp": "123456"}
})


alert = detector.process(parsed_data)
```

Go back [Index](../index.md)
5 changes: 4 additions & 1 deletion src/detectmatelibrary/detectors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .random_detector import RandomDetector, RandomDetectorConfig
from .new_value_detector import NewValueDetector, NewValueDetectorConfig
from .new_event_detector import NewEventDetector, NewEventDetectorConfig
from .charset_detector import CharsetDetector, CharsetDetectorConfig

__all__ = [
"random_detector",
Expand All @@ -9,5 +10,7 @@
"NewValueDetectorConfig",
"RandomDetector",
"NewEventDetector",
"NewEventDetectorConfig"
"NewEventDetectorConfig",
"CharsetDetector",
"CharsetDetectorConfig"
]
158 changes: 158 additions & 0 deletions src/detectmatelibrary/detectors/charset_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from detectmatelibrary.common._config._compile import generate_detector_config
from detectmatelibrary.common._config._formats import EventsConfig
from detectmatelibrary.common.detector import (
CoreDetectorConfig,
CoreDetector,
get_configured_variables,
get_global_variables,
validate_config_coverage,
)
from detectmatelibrary.utils.persistency.event_data_structures.trackers.stability.stability_tracker import (
EventStabilityTracker
)
from detectmatelibrary.utils.persistency.event_persistency import EventPersistency
from detectmatelibrary.utils.data_buffer import BufferMode
from detectmatelibrary.schemas import ParserSchema, DetectorSchema
from detectmatelibrary.constants import GLOBAL_EVENT_ID
from typing_extensions import override
from tools.logging import logger


class CharsetDetectorConfig(CoreDetectorConfig):
    """Configuration for ``CharsetDetector``."""

    # Identifier of the detection method this configuration belongs to.
    method_type: str = "charset_detector"

    # Auto-configuration switches: when enabled, features classified as
    # "STABLE" / "STATIC" during the configure phase are selected as the
    # variables to monitor.
    use_stable_vars: bool = True
    use_static_vars: bool = True


class CharsetDetector(CoreDetector):
    """Detect previously unseen characters in configured log variables.

    Training feeds the configured variables of every event into an
    ``EventPersistency`` whose trackers are created with
    ``expand_value=True``. Detection compares the characters of each
    configured variable against the tracker's ``unique_set`` for that
    variable and raises an alert for every variable that contains characters
    missing from it.
    """

    def __init__(
        self,
        name: str = "CharsetDetector",
        # String annotation: not evaluated at runtime, so the ``|`` syntax is
        # safe on every Python version the package supports.
        config: "CharsetDetectorConfig | dict | None" = None,
    ) -> None:
        """Create the detector.

        Args:
            name: Detector name; also forwarded to ``from_dict`` when
                ``config`` is given as a dict.
            config: A ``CharsetDetectorConfig``, a dict that
                ``CharsetDetectorConfig.from_dict`` can parse, or ``None`` to
                use a fresh default configuration.
        """
        # Build the default per instance rather than in the signature, so
        # detectors never share one config object created at import time.
        if config is None:
            config = CharsetDetectorConfig()
        elif isinstance(config, dict):
            config = CharsetDetectorConfig.from_dict(config, name)

        super().__init__(name=name, buffer_mode=BufferMode.NO_BUF, config=config)
        self.config: CharsetDetectorConfig  # type narrowing for IDE
        self.persistency = EventPersistency(
            event_data_class=EventStabilityTracker,
            event_data_kwargs={"expand_value": True},
        )
        # Separate store filled by configure(); set_configuration() reads its
        # STABLE / STATIC classification to choose the variables to monitor.
        self.auto_conf_persistency = EventPersistency(event_data_class=EventStabilityTracker)
        self._register_persistency(self.persistency)

    def train(self, input_: ParserSchema) -> None:  # type: ignore
        """Train the detector by learning characters from the input data.

        The configured variables are ingested under the event's own ID. When
        global instances are configured and yield variables for this input,
        those are ingested as well under ``GLOBAL_EVENT_ID``.
        """
        self.persistency.ingest_event(
            event_id=input_["EventID"],
            event_template=input_["template"],
            named_variables=get_configured_variables(input_, self.config.events),
        )

        if not self.config.global_instances:
            return
        global_vars = get_global_variables(input_, self.config.global_instances)
        if global_vars:
            self.persistency.ingest_event(
                event_id=GLOBAL_EVENT_ID,
                event_template=input_["template"],
                named_variables=global_vars,
            )

    @staticmethod
    def _find_unknown_chars(
        event_tracker: EventStabilityTracker,
        values: dict,
        label: str,
    ) -> dict[str, str]:
        """Return one alert per tracked variable that contains unseen characters.

        Args:
            event_tracker: Tracker holding the learned data of one event.
            values: Variable name -> current value; variables that are absent
                or ``None`` are skipped.
            label: Prefix of the alert key (``"<label> - <variable name>"``).
        """
        found: dict[str, str] = {}
        for var_name, single_tracker in event_tracker.get_data().items():
            value = values.get(var_name)
            if value is None:
                continue
            # NOTE(review): assumes values are strings, so iterating them
            # yields single characters - confirm against the parser output.
            unknown = set(value) - single_tracker.unique_set
            if unknown:
                found[f"{label} - {var_name}"] = (
                    "Unknown character(s): "
                    + ", ".join(f"'{c}'" for c in sorted(unknown))
                )
        return found

    def detect(
        self, input_: ParserSchema, output_: DetectorSchema  # type: ignore
    ) -> bool:
        """Detect characters in the input data that were not seen in
        training.

        Args:
            input_: Parsed log line to check.
            output_: Detector result; ``score``, ``description`` and
                ``alertsObtain`` are filled only when something is found.

        Returns:
            ``True`` if at least one variable contained an unknown character,
            ``False`` otherwise (``output_`` is then left untouched).
        """
        alerts: dict[str, str] = {}
        configured_variables = get_configured_variables(input_, self.config.events)

        current_event_id = input_["EventID"]
        known_events = self.persistency.get_events_data()

        # Events never seen in training have no learned data to compare with.
        if current_event_id in known_events:
            alerts.update(
                self._find_unknown_chars(
                    known_events[current_event_id],
                    configured_variables,
                    f"EventID {current_event_id}",
                )
            )

        if self.config.global_instances and GLOBAL_EVENT_ID in known_events:
            # train() treats a falsy result as "nothing to ingest"; mirror
            # that here instead of calling .get() on it.
            global_vars = get_global_variables(input_, self.config.global_instances) or {}
            alerts.update(
                self._find_unknown_chars(known_events[GLOBAL_EVENT_ID], global_vars, "Global")
            )

        if not alerts:
            return False

        # Every alert key is unique, so the score is one point per alert.
        output_["score"] = float(len(alerts))
        output_["description"] = (
            f"{self.name} detects characters not encountered in training as anomalies."
        )
        output_["alertsObtain"].update(alerts)
        return True

    def configure(self, input_: ParserSchema) -> None:  # type: ignore
        """Feed one parsed log line into the auto-configuration store.

        Both the positional ``variables`` and the ``logFormatVariables`` of
        the input are ingested so ``set_configuration`` can classify them.
        """
        self.auto_conf_persistency.ingest_event(
            event_id=input_["EventID"],
            event_template=input_["template"],
            variables=input_["variables"],
            named_variables=input_["logFormatVariables"],
        )

    @override
    def post_train(self) -> None:
        """Validate config coverage after training unless auto-config is on."""
        if not self.config.auto_config:
            validate_config_coverage(self.name, self.config.events, self.persistency)

    def set_configuration(self) -> None:
        """Replace the config with one generated from configure-phase data.

        For every event seen by ``configure`` the features classified as
        ``"STABLE"`` and/or ``"STATIC"`` (depending on ``use_stable_vars`` /
        ``use_static_vars``) are selected. The previous ``persist`` setting is
        carried over to the new config. A warning is logged when the
        generated configuration contains no events.
        """
        variables = {}
        for event_id, tracker in self.auto_conf_persistency.get_events_data().items():
            stable = []
            if self.config.use_stable_vars:
                stable = tracker.get_features_by_classification("STABLE")  # type: ignore
            static = []
            if self.config.use_static_vars:
                static = tracker.get_features_by_classification("STATIC")  # type: ignore
            vars_ = stable + static
            if vars_:
                variables[event_id] = vars_

        old_persist = self.config.persist
        config_dict = generate_detector_config(
            variable_selection=variables,
            detector_name=self.name,
            method_type=self.config.method_type,
        )
        # Rebuild the config from the generated dictionary; only the persist
        # setting of the previous config is carried over.
        self.config = CharsetDetectorConfig.from_dict(config_dict, self.name)
        self.config.persist = old_persist

        events = self.config.events
        if isinstance(events, EventsConfig) and not events.events:
            logger.warning(
                f"[{self.name}] auto_config=True generated an empty configuration. "
                "No stable variables were found in configure-phase data. "
                "The detector will produce no alerts."
            )
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class EventTracker(EventDataStructure):

def __init__(
self,
single_tracker_type: Type[SingleTracker] = SingleTracker,
single_tracker_type: Callable[[], SingleTracker] = SingleTracker,
multi_tracker_type: Type[MultiTracker] = MultiTracker,
converter_function: Callable[[Any], Any] = lambda x: x,
) -> None:
Expand Down Expand Up @@ -66,6 +66,17 @@ def dump(self) -> bytes:
def load(cls, data: bytes, **kwargs: Any) -> "EventTracker":
"""Restore tracker state from MessagePack bytes.

Reconstruction follows two paths. When ``cls is EventTracker`` the
legacy path runs: ``__new__`` allocates the instance and the base
``__init__`` is invoked directly with the ``single_tracker_type`` and
``multi_tracker_type`` recorded in the snapshot. For any subclass,
``cls(**kwargs)`` is called instead, which lets subclasses with
closure-based factories (e.g. ``EventStabilityTracker``'s
``expand_value``) rebuild their factory so it survives load.

Contract for subclasses: ``__init__`` must accept the kwargs forwarded
to ``load()`` and must not require additional positional arguments.

Note: event_id and template (base dataclass fields) are not restored;
they remain at defaults (-1 and "") as they are managed by EventPersistency.
"""
Expand All @@ -89,12 +100,15 @@ def _list_keys_to_tuples(pairs: list[tuple[Any, Any]]) -> dict[Any, Any]:
importlib.import_module(state["multi_tracker_module"]),
state["multi_tracker_type"],
)
instance = cls.__new__(cls)
EventTracker.__init__(
instance,
single_tracker_type=single_tracker_cls,
multi_tracker_type=multi_tracker_cls,
)
if cls is EventTracker:
instance = cls.__new__(cls)
EventTracker.__init__(
instance,
single_tracker_type=single_tracker_cls,
multi_tracker_type=multi_tracker_cls,
)
else:
instance = cls(**kwargs)
for name, tracker_state in state["trackers"].items():
instance.multi_tracker.single_trackers[name] = single_tracker_cls.from_state(tracker_state)
return instance
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Dict, Type
from typing import Any, Callable, Dict

from detectmatelibrary.utils.preview_helpers import format_dict_repr

Expand All @@ -9,9 +9,9 @@ class MultiTracker:
"""Tracks multiple features (e.g. variables or variable combos) using
individual trackers."""

def __init__(self, single_tracker_type: Type[SingleTracker] = SingleTracker) -> None:
def __init__(self, single_tracker_type: Callable[[], SingleTracker] = SingleTracker) -> None:
self.single_trackers: Dict[str, SingleTracker] = {}
self.single_tracker_type: Type[SingleTracker] = single_tracker_type
self.single_tracker_type: Callable[[], SingleTracker] = single_tracker_type

def add_data(self, data_object: Dict[str, Any]) -> None:
"""Add data to the appropriate feature trackers."""
Expand Down
Loading
Loading