aws · zhongkechen · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/src/aws_durable_execution_sdk_python/execution.py b/src/aws_durable_execution_sdk_python/execution.py
diff --git a/src/aws_durable_execution_sdk_python/lambda_service.py b/src/aws_durable_execution_sdk_python/lambda_service.py
@@ -96,6 +96,15 @@ class OperationSubType(Enum):
     CHAINED_INVOKE = "ChainedInvoke"
 
 
+class InvocationStatus(Enum):
+    """Possible statuses of an invocation result."""
+
+    SUCCEEDED = "SUCCEEDED"
+    FAILED = "FAILED"
+    PENDING = "PENDING"
+
+    # Used internally only: the invocation failed and the backend will retry
+    RETRY = "RETRY"
+
+
 @dataclass(frozen=True)
 class ExecutionDetails:
     input_payload: str | None = None

diff --git a/src/aws_durable_execution_sdk_python/plugin.py b/src/aws_durable_execution_sdk_python/plugin.py
@@ -0,0 +1,351 @@
+import datetime
+import logging
+from abc import ABC
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from aws_durable_execution_sdk_python.lambda_service import (
+    OperationType,
+    OperationStatus,
+    OperationAction,
+    OperationSubType,
+    ErrorObject,
+    InvocationStatus,
+    Operation,
+    OperationUpdate,
+)
+from aws_durable_execution_sdk_python.types import LambdaContext
+
+if TYPE_CHECKING:
+    from aws_durable_execution_sdk_python.execution import (
+        DurableExecutionInvocationOutput,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OperationStartInfo:
+    """Event payload handed to ``DurableExecutionPlugin.on_operation_start``.
+
+    Also serves as the base class of ``OperationEndInfo`` and
+    ``AttemptStartInfo``, which extend it with further fields.
+    """
+
+    # Identifier of the operation this event refers to.
+    operation_id: str
+    operation_type: OperationType
+    sub_type: OperationSubType | None = None
+    name: str | None = None
+    # Identifier of the parent operation, when there is one.
+    parent_id: str | None = None
+    start_timestamp: datetime.datetime | None = None
+
+
+@dataclass
+class OperationEndInfo(OperationStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_operation_end``.
+
+    ``PluginExecutor.on_operation_update`` emits it once an operation has
+    reached a terminal status.
+    """
+
+    # Defaults to SUCCEEDED when the creator does not pass a status.
+    status: OperationStatus = OperationStatus.SUCCEEDED
+    end_timestamp: datetime.datetime | None = None
+    # Attempt number taken from the step details; None when the operation has none.
+    attempt: int | None = None
+    error: ErrorObject | None = None
+
+
+@dataclass
+class AttemptStartInfo(OperationStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_operation_attempt_start``.
+
+    ``PluginExecutor.on_operation_action`` emits it only for STEP operations.
+    """
+
+    # Attempt number; 1 when the creator does not supply one.
+    attempt: int = 1
+
+
+@dataclass
+class AttemptEndInfo(AttemptStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_operation_attempt_end``.
+
+    ``PluginExecutor.on_operation_update`` emits it for operations carrying
+    step details whose status is terminal or PENDING.
+    """
+
+    # True only when the checkpointed status is SUCCEEDED.
+    succeeded: bool | None = None
+    end_timestamp: datetime.datetime | None = None
+    error: ErrorObject | None = None
+    # NOTE(review): never populated by PluginExecutor in this module, so
+    # plugins currently always observe None here.
+    next_attempt_delay_seconds: int | None = None
+
+
+@dataclass
+class InvocationStartInfo:
+    """Event payload handed to ``DurableExecutionPlugin.on_invocation_start``.
+
+    None of the fields has a default, so all three must be supplied when the
+    object is constructed (``None`` is an accepted value for each).
+    """
+
+    # ``aws_request_id`` of the Lambda context, or None when no context is available.
+    request_id: str | None
+    execution_arn: str | None
+    # Start timestamp of the execution operation, or None when it is unknown.
+    start_timestamp: datetime.datetime | None
+
+
+@dataclass
+class InvocationEndInfo(InvocationStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_invocation_end``."""
+
+    # Defaults to SUCCEEDED when the creator does not pass a status.
+    status: InvocationStatus = InvocationStatus.SUCCEEDED
+    end_timestamp: datetime.datetime | None = None
+    error: ErrorObject | None = None
+
+
+@dataclass
+class ExecutionStartInfo(InvocationStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_execution_start``.
+
+    Adds no fields of its own; the distinct type lets the dispatcher tell an
+    execution start apart from an invocation start. ``PluginExecutor`` emits
+    it only when the invocation is not a replay.
+    """
+
+    pass
+
+
+@dataclass
+class ExecutionEndInfo(ExecutionStartInfo):
+    """Event payload handed to ``DurableExecutionPlugin.on_execution_end``.
+
+    ``PluginExecutor.on_invocation_end`` emits it only when the invocation
+    output status is SUCCEEDED or FAILED.
+    """
+
+    # Defaults to SUCCEEDED when the creator does not pass a status.
+    status: InvocationStatus = InvocationStatus.SUCCEEDED
+    end_timestamp: datetime.datetime | None = None
+    error: ErrorObject | None = None
+
+
+class DurableExecutionPlugin(ABC):
+    """Base class for plugins. Override only the methods you need.
+
+    Every hook is a no-op by default and returns ``None``. ``PluginExecutor``
+    calls the hooks from its worker threads and logs, then ignores, any
+    exception a hook raises.
+    """
+
+    def on_execution_start(self, info: ExecutionStartInfo) -> None:
+        """Hook for an ``ExecutionStartInfo`` event; does nothing by default."""
+
+    def on_execution_end(self, info: ExecutionEndInfo) -> None:
+        """Hook for an ``ExecutionEndInfo`` event; does nothing by default."""
+
+    def on_invocation_start(self, info: InvocationStartInfo) -> None:
+        """Hook for an ``InvocationStartInfo`` event; does nothing by default."""
+
+    def on_invocation_end(self, info: InvocationEndInfo) -> None:
+        """Hook for an ``InvocationEndInfo`` event; does nothing by default."""
+
+    def on_operation_start(self, info: OperationStartInfo) -> None:
+        """Hook for an ``OperationStartInfo`` event; does nothing by default."""
+
+    def on_operation_end(self, info: OperationEndInfo) -> None:
+        """Hook for an ``OperationEndInfo`` event; does nothing by default."""
+
+    def on_operation_attempt_start(self, info: AttemptStartInfo) -> None:
+        """Hook for an ``AttemptStartInfo`` event; does nothing by default."""
+
+    def on_operation_attempt_end(self, info: AttemptEndInfo) -> None:
+        """Hook for an ``AttemptEndInfo`` event; does nothing by default."""
+
+    # TODO: further discussions required to finalize the following interface
+    # def enrich_log_context(self, info: OperationStartInfo | None) -> Dict[str, Any] | None: pass
+
+
+class PluginExecutor:
+    """Fan lifecycle events out to the registered plugins on a thread pool.
+
+    The public ``on_*`` methods translate SDK state into ``*Info`` objects and
+    submit one task per plugin per event. Exceptions raised inside a plugin
+    callback are logged and ignored (see ``_dispatch_plugin``).
+    """
+
+    _DEFAULT_MAX_WORKERS = 4
+
+    def __init__(
+        self,
+        plugins: list[DurableExecutionPlugin] | None,
+        max_workers: int | None = None,
+    ):
+        """Create the executor.
+
+        Args:
+            plugins: plugins to notify; ``None`` or an empty list disables dispatching
+            max_workers: thread pool size; a falsy value selects ``_DEFAULT_MAX_WORKERS``
+        """
+        self.plugins = plugins or []
+        self._pending_futures: list = []
+        # The pool is only created when there is at least one plugin to call.
+        self._executor: ThreadPoolExecutor | None = (
+            ThreadPoolExecutor(
+                max_workers=max_workers or self._DEFAULT_MAX_WORKERS,
+                thread_name_prefix="plugin-executor",
+            )
+            if self.plugins
+            else None
+        )
+
+    @staticmethod
+    def _utc_now() -> datetime.datetime:
+        """Return the current time as a timezone-aware UTC datetime.
+
+        A naive ``datetime.now()`` cannot be subtracted from or compared with
+        an aware datetime, so locally generated timestamps are made aware.
+        NOTE(review): this assumes the timestamps on ``Operation`` are
+        timezone-aware -- confirm against lambda_service.
+        """
+        return datetime.datetime.now(datetime.timezone.utc)
+
+    def close(self) -> None:
+        """Shut down the thread pool, waiting for pending tasks to complete."""
+        if self._executor is None:
+            return
+        try:
+            self.flush()
+        finally:
+            # Release the worker threads even if waiting on a future raised.
+            self._executor.shutdown(wait=True)
+
+    def flush(self) -> None:
+        """Wait for all pending plugin tasks to complete. Useful for testing."""
+        for future in self._pending_futures:
+            future.result()
+        self._pending_futures.clear()
+
+    def _dispatch_plugin(self, plugin: DurableExecutionPlugin, info) -> None:
+        """Invoke the appropriate plugin callback. Runs inside the thread pool.
+
+        The ``case`` order matters: every ``*EndInfo`` class inherits from its
+        ``*StartInfo`` counterpart, so subclasses must be matched before their
+        base classes.
+        """
+        try:
+            match info:
+                case ExecutionEndInfo():
+                    plugin.on_execution_end(info)
+                case InvocationEndInfo():
+                    plugin.on_invocation_end(info)
+                case ExecutionStartInfo():
+                    plugin.on_execution_start(info)
+                case InvocationStartInfo():
+                    plugin.on_invocation_start(info)
+                case AttemptEndInfo():
+                    plugin.on_operation_attempt_end(info)
+                case OperationEndInfo():
+                    plugin.on_operation_end(info)
+                case AttemptStartInfo():
+                    plugin.on_operation_attempt_start(info)
+                case OperationStartInfo():
+                    plugin.on_operation_start(info)
+                case _:
+                    raise ValueError(f"Unknown info type: {type(info)}")
+        except Exception:
+            # log and ignore the exception
+            logger.exception("Plugin %s exception ignored", plugin.__class__.__name__)
+
+    def execute_plugins(self, info) -> None:
+        """Submit one dispatch task per registered plugin for ``info``.
+
+        The resulting futures are remembered so that ``flush`` can wait on them.
+        """
+        if not self.plugins:
+            return
+        for plugin in self.plugins:
+            future = self._executor.submit(self._dispatch_plugin, plugin, info)
+            self._pending_futures.append(future)
+
+    def on_invocation_start(
+        self,
+        durable_execution_arn: str,
+        context: LambdaContext | None,
+        execution_operation: Operation | None,
+        is_replaying: bool,
+    ) -> None:
+        """Emit the start events for an invocation.
+
+        ``ExecutionStartInfo`` is emitted first, and only when the invocation
+        is not a replay; ``InvocationStartInfo`` is always emitted.
+
+        Args:
+            durable_execution_arn: ARN reported to plugins as ``execution_arn``
+            context: Lambda context supplying ``aws_request_id``, if available
+            execution_operation: operation supplying the start timestamp, if available
+            is_replaying: whether this invocation is a replay
+        """
+        aws_request_id = context.aws_request_id if context else None
+        start_timestamp = (
+            execution_operation.start_timestamp if execution_operation else None
+        )
+
+        if not is_replaying:
+            self.execute_plugins(
+                ExecutionStartInfo(
+                    request_id=aws_request_id,
+                    execution_arn=durable_execution_arn,
+                    start_timestamp=start_timestamp,
+                )
+            )
+
+        self.execute_plugins(
+            InvocationStartInfo(
+                request_id=aws_request_id,
+                execution_arn=durable_execution_arn,
+                start_timestamp=start_timestamp,
+            )
+        )
+
+    def on_invocation_end(
+        self,
+        durable_execution_arn: str | None,
+        context: LambdaContext | None,
+        execution_operation: Operation | None,
+        output: "DurableExecutionInvocationOutput",
+    ) -> None:
+        """Emit the end events for an invocation.
+
+        ``InvocationEndInfo`` is always emitted; ``ExecutionEndInfo`` follows
+        only when ``output.status`` is SUCCEEDED or FAILED.
+
+        Args:
+            durable_execution_arn: ARN reported to plugins as ``execution_arn``
+            context: Lambda context supplying ``aws_request_id``, if available
+            execution_operation: operation supplying start/end timestamps, if available
+            output: invocation output supplying ``status`` and ``error``
+        """
+        start_timestamp = (
+            execution_operation.start_timestamp if execution_operation else None
+        )
+        # the actual end timestamp may be unknown because it's not checkpointed yet
+        end_timestamp: datetime.datetime = (
+            execution_operation.end_timestamp if execution_operation else None
+        ) or self._utc_now()
+        request_id = context.aws_request_id if context else None
+
+        self.execute_plugins(
+            InvocationEndInfo(
+                request_id=request_id,
+                execution_arn=durable_execution_arn,
+                start_timestamp=start_timestamp,
+                status=output.status,
+                end_timestamp=end_timestamp,
+                error=output.error,
+            )
+        )
+
+        if output.status in [InvocationStatus.SUCCEEDED, InvocationStatus.FAILED]:
+            self.execute_plugins(
+                ExecutionEndInfo(
+                    request_id=request_id,
+                    execution_arn=durable_execution_arn,
+                    start_timestamp=start_timestamp,
+                    status=output.status,
+                    end_timestamp=end_timestamp,
+                    error=output.error,
+                )
+            )
+
+    def on_operation_action(
+        self, operation: Operation | None, update: OperationUpdate
+    ) -> None:
+        """Execute any registered plugins for a given operation before it is updated.
+
+        Only START actions produce events: an ``OperationStartInfo`` and, for
+        STEP operations, an additional ``AttemptStartInfo``.
+
+        Args:
+            operation: the current state of the operation, if known; only its
+                step details are read, to determine the attempt number
+            update: the operation update that is pending checkpoint
+        """
+        if update.action is not OperationAction.START:
+            return
+
+        # Both events describe the same instant, so the clock is read once.
+        started_at = self._utc_now()
+
+        self.execute_plugins(
+            OperationStartInfo(
+                operation_id=update.operation_id,
+                operation_type=update.operation_type,
+                sub_type=update.sub_type,
+                name=update.name,
+                parent_id=update.parent_id,
+                start_timestamp=started_at,
+            )
+        )
+
+        if update.operation_type is OperationType.STEP:
+            # Without step details the attempt number falls back to 1.
+            attempt = (
+                operation.step_details.attempt
+                if operation and operation.step_details
+                else 1
+            )
+            self.execute_plugins(
+                AttemptStartInfo(
+                    operation_id=update.operation_id,
+                    operation_type=update.operation_type,
+                    sub_type=update.sub_type,
+                    name=update.name,
+                    parent_id=update.parent_id,
+                    start_timestamp=started_at,
+                    attempt=attempt,
+                )
+            )
+
+    def on_operation_update(self, operation: Operation) -> None:
+        """Execute any registered plugins for a given operation after it is updated.
+
+        Updates such as STARTED might be omitted because START and completion action (e.g. SUCCEED/FAIL) may be
+        checkpointed in batch and the backend returns only the terminal status (e.g. SUCCEEDED/PENDING/FAILED).
+
+        Args:
+            operation: the operation that was just checkpointed
+        """
+        params = dict(
+            operation_id=operation.operation_id,
+            operation_type=operation.operation_type,
+            sub_type=operation.sub_type,
+            name=operation.name,
+            parent_id=operation.parent_id,
+            start_timestamp=operation.start_timestamp,
+        )
+        if operation.step_details and (
+            self._is_terminal_status(operation.status)
+            # PENDING in addition to terminal status
+            or operation.status is OperationStatus.PENDING
+        ):
+            self.execute_plugins(
+                AttemptEndInfo(
+                    **params,
+                    end_timestamp=operation.end_timestamp,
+                    attempt=operation.step_details.attempt,
+                    succeeded=operation.status is OperationStatus.SUCCEEDED,
+                    error=operation.step_details.error,
+                )
+            )
+
+        if self._is_terminal_status(operation.status):
+            attempt = operation.step_details.attempt if operation.step_details else None
+            self.execute_plugins(
+                OperationEndInfo(
+                    **params,
+                    end_timestamp=operation.end_timestamp,
+                    status=operation.status,
+                    error=self._extract_error(operation),
+                    attempt=attempt,
+                )
+            )
+
+    @staticmethod
+    def _extract_error(operation: Operation):
+        """Return the first error found on the operation's detail objects.
+
+        The details are checked in the order step, callback, chained invoke,
+        context; ``None`` is returned when none of them carries an error.
+        """
+        if operation.step_details and operation.step_details.error:
+            return operation.step_details.error
+        if operation.callback_details and operation.callback_details.error:
+            return operation.callback_details.error
+        if operation.chained_invoke_details and operation.chained_invoke_details.error:
+            return operation.chained_invoke_details.error
+        if operation.context_details and operation.context_details.error:
+            return operation.context_details.error
+        return None
+
+    @staticmethod
+    def _is_terminal_status(status) -> bool:
+        """Return True when ``status`` is one of the final operation statuses."""
+        return status in [
+            OperationStatus.SUCCEEDED,
+            OperationStatus.FAILED,
+            OperationStatus.TIMED_OUT,
+            OperationStatus.CANCELLED,
+            OperationStatus.STOPPED,
+        ]