From 4e7f136c3b893c13decec74b1cd8f36b0d05659b Mon Sep 17 00:00:00 2001 From: Offending Commit Date: Fri, 3 Apr 2026 17:48:52 -0500 Subject: [PATCH 1/9] fix(dreamer): use correct filter syntax for surprisal level query The surprisal observation fetch passed a list directly as the filter value ({"level": [...]}), which generated invalid SQL (level = ARRAY) instead of level IN (...). Use the {"in": [...]} operator syntax. --- src/dreamer/surprisal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dreamer/surprisal.py b/src/dreamer/surprisal.py index faaf7c7e5..a78f94be0 100644 --- a/src/dreamer/surprisal.py +++ b/src/dreamer/surprisal.py @@ -236,7 +236,7 @@ async def _fetch_recent_observations( workspace_name=workspace_name, observer=observer, observed=observed, - filters={"level": levels} if levels else None, + filters={"level": {"in": levels}} if levels else None, limit=limit, ) From 2b6035c9d3beb154a10d9dd3fd46f5d546305dd3 Mon Sep 17 00:00:00 2001 From: Offending Commit Date: Sat, 4 Apr 2026 16:07:54 -0500 Subject: [PATCH 2/9] feat: add local LM Studio support, Prometheus metrics, Traefik proxy, and Grafana dashboards Add Docker infrastructure for local development with LM Studio as LLM provider, Prometheus metrics collection with custom histograms, Traefik reverse proxy configuration, and Grafana dashboard provisioning. Update SDK session handling and deriver queue management for improved reliability. --- .gitignore | 2 + CLAUDE.md | 31 + Dockerfile | 3 + docker-compose.yml.example | 56 +- docker/entrypoint.sh | 4 +- docker/grafana-dashboards.yml | 11 + .../grafana-dashboards/honcho-overview.json | 628 ++++++++++ .../honcho-queue-health.json | 1116 +++++++++++++++++ docker/grafana-datasource.yml | 1 + docker/traefik/dynamic.yml | 77 ++ sdks/python/src/honcho/aio.py | 42 +- sdks/python/src/honcho/session.py | 42 +- src/config.py | 1 + src/deriver/enqueue.py | 14 + src/deriver/queue_manager.py | 384 +++++- src/embedding_client.py | 2 +- src/main.py | 9 + src/routers/messages.py | 2 + src/routers/sessions.py | 12 + src/telemetry/prometheus/metrics.py | 310 ++++- tests/sdk/test_session.py | 77 ++ traces.jsonl | 10 + 22 files changed, 2794 insertions(+), 40 deletions(-) create mode 100644 docker/grafana-dashboards.yml create mode 100644 docker/grafana-dashboards/honcho-overview.json create mode 100644 docker/grafana-dashboards/honcho-queue-health.json create mode 100644 docker/traefik/dynamic.yml create mode 100644 traces.jsonl diff --git a/.gitignore b/.gitignore index 9a31fc383..79c62f464 100644 --- a/.gitignore +++ b/.gitignore @@ -191,3 +191,5 @@ metrics.jsonl AGENTS.md lancedb_data/ grafana-data/ +.codex_honcho_runtime/ +.codex_honcho_setup/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 7ace35299..8190dc96f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -84,6 +84,37 @@ All API routes follow the pattern: `/v1/{resource}/{id}/{action}` - Typechecking: `uv run basedpyright` - Format code: `uv run ruff format src/` +### Local LM Studio Setup + +- Honcho can use LM Studio for generation through the `custom` provider path. +- Keep `LLM_OPENAI_API_KEY` configured for embeddings unless embedding support is added for local models. +- For Docker Compose, `LLM_OPENAI_COMPATIBLE_BASE_URL` must be `http://host.docker.internal:1234/v1`, not `http://localhost:1234/v1`. +- `LLM_OPENAI_COMPATIBLE_API_KEY=lm-studio` is sufficient for local use. +- Current local default model is `qwen2.5-14b-instruct`. 
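+- A minimal `.env` sketch for this setup, using only the variables named in the bullets above (the OpenAI key value is a placeholder; supply your own):
+
+```bash
+# LM Studio serves generation; host.docker.internal reaches the host from containers
+LLM_OPENAI_COMPATIBLE_BASE_URL=http://host.docker.internal:1234/v1
+LLM_OPENAI_COMPATIBLE_API_KEY=lm-studio
+# Embeddings still go to OpenAI until local embedding support is added
+LLM_OPENAI_API_KEY=sk-your-key
+```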
+- When overriding `DIALECTIC_LEVELS__*` via env vars, each level needs its full required settings, not just `PROVIDER` and `MODEL`. Include `THINKING_BUDGET_TOKENS` and `MAX_TOOL_ITERATIONS`, and optionally `MAX_OUTPUT_TOKENS`.
+- Docker should own the runtime environment completely. Do not mount the repo onto `/app` and do not mount a named volume onto `/app/.venv`, or the image-built environment can be hidden and replaced with incompatible artifacts.
+- If Docker services fail with missing Python modules or incompatible native extensions, rebuild the image instead of trying to repair the environment in place:
+
+```bash
+docker compose build --no-cache api deriver
+docker compose up -d --force-recreate api deriver
+```
+
+- Verify LM Studio from the host with:
+
+```bash
+curl -sS http://localhost:1234/v1/models
+```
+
+- Verify LM Studio from Docker with (note: the script is single-quoted, so no backslash escapes are needed inside it):
+
+```bash
+docker compose run --rm --entrypoint sh api -lc 'python - <<"PY"
+import urllib.request
+print(urllib.request.urlopen("http://host.docker.internal:1234/v1/models", timeout=5).status)
+PY'
+```
+
 ### SDK Testing

 #### TypeScript SDK
diff --git a/Dockerfile b/Dockerfile
index 4a68d6171..df0e5d74e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,10 +41,13 @@ RUN addgroup --system app && adduser --system --group app && mkdir -p /tmp/uv-ca
 COPY --chown=app:app src/ /app/src/
 COPY --chown=app:app migrations/ /app/migrations/
 COPY --chown=app:app scripts/ /app/scripts/
+COPY --chown=app:app docker/entrypoint.sh /app/docker/entrypoint.sh
 COPY --chown=app:app alembic.ini /app/alembic.ini

 # Copy config files - this will copy config.toml if it exists, and config.toml.example
 COPY --chown=app:app config.toml* /app/

+RUN chmod +x /app/docker/entrypoint.sh
+
 # Switch to non-root user
 USER app
diff --git a/docker-compose.yml.example b/docker-compose.yml.example
index 8bb8507fd..8abadda2c 100644
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -1,39 +1,55 @@
 services:
+  traefik:
+    image: traefik:v3.2
+    command:
+      - --api.dashboard=true
+      - --api.insecure=true
+      - --providers.file.filename=/etc/traefik/dynamic.yml
+      - --providers.file.watch=true
+      - --entrypoints.web.address=:8000
+      - --ping=true
+    ports:
+      - ${HONCHO_HTTP_PORT:-8000}:8000
+      - ${TRAEFIK_DASHBOARD_PORT:-8080}:8080
+    volumes:
+      - ./docker/traefik/dynamic.yml:/etc/traefik/dynamic.yml:ro
+    networks:
+      - honcho
+
   api:
     image: honcho:latest
     build:
       context: .
       dockerfile: Dockerfile
-    entrypoint: ["sh", "docker/entrypoint.sh"]
+    entrypoint: ["/app/docker/entrypoint.sh"]
    depends_on:
       database:
         condition: service_healthy
       redis:
         condition: service_healthy
-    ports:
-      - 8000:8000
-    volumes:
-      - .:/app
-      - venv:/app/.venv
+      traefik:
+        condition: service_started
+    expose:
+      - 8000
     environment:
       - DB_CONNECTION_URI=postgresql+psycopg://postgres:postgres@database:5432/postgres
       - CACHE_URL=redis://redis:6379/0?suppress=true
     env_file:
       - path: .env
         required: false
+    networks:
+      - honcho
+
   deriver:
     build:
       context: .
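+      # Same image/build as the api service; with the bind mounts over /app and
+      # /app/.venv removed, the environment baked into the image is what runs.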
dockerfile: Dockerfile - entrypoint: ["/app/.venv/bin/python", "-m", "src.deriver"] + entrypoint: ["python", "-m", "src.deriver"] depends_on: database: condition: service_healthy redis: condition: service_healthy - volumes: - - .:/app - - venv:/app/.venv environment: - DB_CONNECTION_URI=postgresql+psycopg://postgres:postgres@database:5432/postgres - CACHE_URL=redis://redis:6379/0?suppress=true @@ -41,6 +57,9 @@ services: env_file: - path: .env required: false + networks: + - honcho + database: image: pgvector/pgvector:pg15 restart: always @@ -56,11 +75,14 @@ services: volumes: - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql - pgdata:/var/lib/postgresql/data/ + networks: + - honcho healthcheck: test: ["CMD-SHELL", "pg_isready -U postgres -d postgres"] interval: 5s timeout: 5s retries: 5 + redis: image: redis:8.2 restart: always @@ -68,11 +90,14 @@ services: - 6379:6379 volumes: - ./redis-data:/data + networks: + - honcho healthcheck: test: ["CMD-SHELL", "redis-cli ping"] interval: 5s timeout: 5s retries: 5 + prometheus: image: prom/prometheus:v3.2.1 ports: @@ -83,6 +108,9 @@ services: depends_on: api: condition: service_started + networks: + - honcho + grafana: image: grafana/grafana:11.4.0 ports: @@ -98,7 +126,13 @@ services: depends_on: prometheus: condition: service_started + networks: + - honcho + +networks: + honcho: + name: honcho + volumes: pgdata: - venv: prometheus-data: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index bc8e3f379..3c4efcc62 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -2,7 +2,7 @@ set -e echo "Running database migrations..." -/app/.venv/bin/python scripts/provision_db.py +python scripts/provision_db.py echo "Starting API server..." -exec /app/.venv/bin/fastapi run --host 0.0.0.0 src/main.py +exec fastapi run --host 0.0.0.0 src/main.py diff --git a/docker/grafana-dashboards.yml b/docker/grafana-dashboards.yml new file mode 100644 index 000000000..c3be2c6a0 --- /dev/null +++ b/docker/grafana-dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: Honcho + orgId: 1 + folder: Honcho + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards/files diff --git a/docker/grafana-dashboards/honcho-overview.json b/docker/grafana-dashboards/honcho-overview.json new file mode 100644 index 000000000..abf3e1f9f --- /dev/null +++ b/docker/grafana-dashboards/honcho-overview.json @@ -0,0 +1,628 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": 
"sum(rate(api_requests_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "API Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(deriver_queue_items_processed_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Deriver Queue Throughput", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(dialectic_calls_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Dialectic Calls", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(messages_created_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Messages Created", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 14, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (status_code) (rate(api_requests_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "{{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "API Requests by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 14, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task_type) (rate(deriver_queue_items_processed_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "{{task_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Deriver Queue Items by Task Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 14, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (token_type, 
task_type) (rate(deriver_tokens_processed_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "{{task_type}} {{token_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Deriver Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 14, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (token_type, reasoning_level) (rate(dialectic_tokens_processed_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "{{reasoning_level}} {{token_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Dialectic Tokens", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "honcho" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Honcho Overview", + "uid": "honcho-overview", + "version": 1, + "weekStart": "" +} diff --git a/docker/grafana-dashboards/honcho-queue-health.json b/docker/grafana-dashboards/honcho-queue-health.json new file mode 100644 index 000000000..9d5f519ce --- /dev/null +++ b/docker/grafana-dashboards/honcho-queue-health.json @@ -0,0 +1,1116 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "deriver_active_workers{namespace=\"honcho\"}", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Active Deriver Workers", + "type": "stat" + }, + { 
+ "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(deriver_queue_depth{namespace=\"honcho\",state=\"pending\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Pending Queue Items", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(deriver_queue_depth{namespace=\"honcho\",state=\"in_progress\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "In Progress Queue Items", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(deriver_queue_error_backlog{namespace=\"honcho\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Errored Queue Backlog", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 120 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + 
"pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "max(deriver_queue_oldest_age_seconds{namespace=\"honcho\",state=\"pending\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Oldest Pending Age", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.25 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(api_request_duration_seconds_bucket{namespace=\"honcho\"}[5m])) by (le))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "API Request P95", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 7, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task_type, state) (deriver_queue_depth{namespace=\"honcho\"})", + "legendFormat": "{{task_type}} {{state}}", + "range": true, + "refId": "A" + } + ], + "title": "Queue Depth by Task Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 8, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "max by (task_type) (deriver_queue_oldest_age_seconds{namespace=\"honcho\",state=\"pending\"})", + "legendFormat": "{{task_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Oldest Pending Age by Task Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 9, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(deriver_queue_item_latency_seconds_bucket{namespace=\"honcho\"}[5m])) by (le, task_type, outcome))", + "legendFormat": "p50 {{task_type}} {{outcome}}", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(deriver_queue_item_latency_seconds_bucket{namespace=\"honcho\"}[5m])) by (le, task_type, outcome))", + "legendFormat": "p95 {{task_type}} {{outcome}}", + "range": true, + "refId": "B" + } + ], + "title": "Queue Item Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task_type) (rate(deriver_queue_errors_total{namespace=\"honcho\"}[5m]))", + "legendFormat": "{{task_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Queue Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 34 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(sessions_active{namespace=\"honcho\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 300 + }, + { + "color": "red", + "value": 1800 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 34 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(deriver_queue_depth{namespace=\"honcho\",state=~\"pending|in_progress\"}) / clamp_min(sum(rate(deriver_queue_items_processed_total{namespace=\"honcho\"}[10m])), 0.001)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Estimated Queue Drain Time", + "type": 
"stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 34 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(deriver_queue_items_enqueued_total{namespace=\"honcho\"}[10m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Queue Arrival Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 34 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(deriver_queue_items_enqueued_total{namespace=\"honcho\"}[10m])) - sum(rate(deriver_queue_items_processed_total{namespace=\"honcho\"}[10m]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Net Queue Pressure", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 15, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (workspace_name, session_name) (session_queue_depth{namespace=\"honcho\",state=~\"pending|in_progress\"}))", + "legendFormat": "{{workspace_name}} / {{session_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Hot Sessions by Queue Depth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 16, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (workspace_name, session_name) (rate(messages_created_total{namespace=\"honcho\"}[15m])))", + "legendFormat": "{{workspace_name}} / {{session_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Hot Sessions by Message Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 17, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, max by (workspace_name, session_name) (session_queue_oldest_age_seconds{namespace=\"honcho\",state=\"pending\"}))", + "legendFormat": "{{workspace_name}} / {{session_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Sessions With Oldest Pending Work", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 14, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 18, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (workspace_name, session_name) (rate(session_context_requests_total{namespace=\"honcho\"}[15m])))", + "legendFormat": "context {{workspace_name}} / {{session_name}}", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "topk(10, sum by (workspace_name, session_name) (rate(session_search_requests_total{namespace=\"honcho\"}[15m])))", + "legendFormat": "search {{workspace_name}} / {{session_name}}", + "range": true, + "refId": "B" + } + ], + "title": "Session Context and Search Request Rate", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "honcho", + "queue" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Honcho Queue Health", + "uid": "honcho-queue-health", + "version": 2, + "weekStart": "" +} diff --git a/docker/grafana-datasource.yml b/docker/grafana-datasource.yml index bb009bb21..00f99157b 100644 --- a/docker/grafana-datasource.yml +++ b/docker/grafana-datasource.yml @@ -2,6 +2,7 @@ apiVersion: 1 datasources: - name: Prometheus + uid: prometheus type: prometheus access: proxy url: http://prometheus:9090 diff --git a/docker/traefik/dynamic.yml b/docker/traefik/dynamic.yml new file mode 100644 index 000000000..add57262e --- /dev/null +++ b/docker/traefik/dynamic.yml @@ -0,0 +1,77 @@ +http: + routers: + honcho-health: + rule: Path(`/health`) + entryPoints: + - web + service: ping@internal + + honcho-api-double-v3: + rule: PathPrefix(`/v3/v3`) + entryPoints: + - 
web + priority: 120 + middlewares: + - honcho-strip-extra-v3 + service: honcho-api + + honcho-api-v3-docs: + rule: Path(`/v3/docs`) || Path(`/v3/openapi.json`) + entryPoints: + - web + priority: 110 + middlewares: + - honcho-v3-docs + service: honcho-api + + honcho-api: + rule: PathPrefix(`/v3`) || Path(`/docs`) || Path(`/openapi.json`) || Path(`/metrics`) + entryPoints: + - web + priority: 100 + service: honcho-api + + honcho-api-root-fallback: + rule: PathPrefix(`/workspaces`) || PathPrefix(`/keys`) + entryPoints: + - web + priority: 90 + middlewares: + - honcho-add-v3 + service: honcho-api + + honcho-api-root: + rule: Path(`/`) + entryPoints: + - web + priority: 80 + middlewares: + - honcho-root-docs + service: honcho-api + + middlewares: + honcho-v3-docs: + replacePathRegex: + regex: ^/v3/(docs|openapi\.json)$ + replacement: /$1 + + honcho-strip-extra-v3: + replacePathRegex: + regex: ^/v3/v3(?:/(.*))?$ + replacement: /v3/$1 + + honcho-add-v3: + replacePathRegex: + regex: ^/(workspaces|keys)(.*)$ + replacement: /v3/$1$2 + + honcho-root-docs: + replacePathRegex: + regex: ^/$ + replacement: /docs + + services: + honcho-api: + loadBalancer: + servers: + - url: http://api:8000 diff --git a/sdks/python/src/honcho/aio.py b/sdks/python/src/honcho/aio.py index 0edcc60f4..07aaf0128 100644 --- a/sdks/python/src/honcho/aio.py +++ b/sdks/python/src/honcho/aio.py @@ -68,7 +68,7 @@ from .conclusions import ConclusionCreateParams from .peer import Peer -from .session import Session +from .session import MAX_MESSAGES_PER_BATCH, Session logger = logging.getLogger(__name__) @@ -1006,18 +1006,36 @@ async def add_messages( if not isinstance(messages, list): messages = [messages] - messages_data = [ - msg.model_dump(mode="json", exclude_none=True) for msg in messages - ] + created_messages: list[Message] = [] + total_messages = len(messages) + + for batch_start in range(0, total_messages, MAX_MESSAGES_PER_BATCH): + batch = messages[batch_start : batch_start + MAX_MESSAGES_PER_BATCH] + logger.info( + "Sending Honcho message batch", + extra={ + "session_id": self._session.id, + "workspace_id": self._session.workspace_id, + "batch_size": len(batch), + "batch_start": batch_start, + "total_messages": total_messages, + }, + ) - data = await self._session._honcho._async_http_client.post( - routes.messages(self._session.workspace_id, self._session.id), - body={"messages": messages_data}, - ) - return [ - Message.from_api_response(MessageResponse.model_validate(msg)) - for msg in data - ] + messages_data = [ + msg.model_dump(mode="json", exclude_none=True) for msg in batch + ] + + data = await self._session._honcho._async_http_client.post( + routes.messages(self._session.workspace_id, self._session.id), + body={"messages": messages_data}, + ) + created_messages.extend( + Message.from_api_response(MessageResponse.model_validate(msg)) + for msg in data + ) + + return created_messages async def messages( self, diff --git a/sdks/python/src/honcho/session.py b/sdks/python/src/honcho/session.py index 8a91d7885..38824a251 100644 --- a/sdks/python/src/honcho/session.py +++ b/sdks/python/src/honcho/session.py @@ -42,6 +42,8 @@ __all__ = ["Session", "SessionPeerConfig"] +MAX_MESSAGES_PER_BATCH = 100 + class Session(SessionBase, MetadataConfigMixin): """ @@ -427,18 +429,36 @@ def add_messages( if not isinstance(messages, list): messages = [messages] - messages_data = [ - msg.model_dump(mode="json", exclude_none=True) for msg in messages - ] + created_messages: list[Message] = [] + total_messages = len(messages) + + for 
batch_start in range(0, total_messages, MAX_MESSAGES_PER_BATCH): + batch = messages[batch_start : batch_start + MAX_MESSAGES_PER_BATCH] + logger.info( + "Sending Honcho message batch", + extra={ + "session_id": self.id, + "workspace_id": self.workspace_id, + "batch_size": len(batch), + "batch_start": batch_start, + "total_messages": total_messages, + }, + ) - data = self._honcho._http.post( - routes.messages(self.workspace_id, self.id), - body={"messages": messages_data}, - ) - return [ - Message.from_api_response(MessageResponse.model_validate(msg)) - for msg in data - ] + messages_data = [ + msg.model_dump(mode="json", exclude_none=True) for msg in batch + ] + + data = self._honcho._http.post( + routes.messages(self.workspace_id, self.id), + body={"messages": messages_data}, + ) + created_messages.extend( + Message.from_api_response(MessageResponse.model_validate(msg)) + for msg in data + ) + + return created_messages @validate_call def messages( diff --git a/src/config.py b/src/config.py index f01c451b1..9663aed01 100644 --- a/src/config.py +++ b/src/config.py @@ -214,6 +214,7 @@ class LLMSettings(HonchoSettings): VLLM_BASE_URL: str | None = None EMBEDDING_PROVIDER: Literal["openai", "gemini", "openrouter"] = "openai" + EMBEDDING_MODEL: str | None = None # General LLM settings DEFAULT_MAX_TOKENS: Annotated[int, Field(default=1000, gt=0, le=100_000)] = 2500 diff --git a/src/deriver/enqueue.py b/src/deriver/enqueue.py index 71b00a2ba..1cc7e084a 100644 --- a/src/deriver/enqueue.py +++ b/src/deriver/enqueue.py @@ -12,6 +12,7 @@ from src.exceptions import ValidationException from src.models import QueueItem from src.schemas import MessageConfiguration, ResolvedConfiguration +from src.telemetry import prometheus_metrics from src.utils.config_helpers import get_configuration from src.utils.queue_payload import ( create_deletion_payload, @@ -70,6 +71,19 @@ async def enqueue(payload: list[dict[str, Any]]) -> None: stmt = insert(QueueItem).returning(QueueItem) await db_session.execute(stmt, queue_records) await db_session.commit() + if settings.METRICS.ENABLED: + enqueued_by_task_type: dict[str, int] = {} + for queue_record in queue_records: + task_type = queue_record["task_type"] + enqueued_by_task_type[task_type] = ( + enqueued_by_task_type.get(task_type, 0) + 1 + ) + for task_type, count in enqueued_by_task_type.items(): + prometheus_metrics.record_deriver_queue_item_enqueued( + count=count, + workspace_name=workspace_name, + task_type=task_type, + ) except Exception as e: logger.exception("Failed to enqueue message(s)!") diff --git a/src/deriver/queue_manager.py b/src/deriver/queue_manager.py index 5e885255e..d9b3c2b58 100644 --- a/src/deriver/queue_manager.py +++ b/src/deriver/queue_manager.py @@ -1,5 +1,6 @@ import asyncio import signal +import time from asyncio import Task from collections.abc import Sequence from datetime import datetime, timedelta, timezone @@ -10,7 +11,7 @@ from dotenv import load_dotenv from nanoid import generate as generate_nanoid from sentry_sdk.integrations.asyncio import AsyncioIntegration -from sqlalchemy import and_, delete, or_, select, update +from sqlalchemy import and_, case, delete, or_, select, update from sqlalchemy.dialects.postgresql import insert from sqlalchemy.engine import CursorResult from sqlalchemy.ext.asyncio import AsyncSession @@ -62,6 +63,15 @@ def __init__(self): self.active_tasks: set[asyncio.Task[None]] = set() self.worker_ownership: dict[str, WorkerOwnership] = {} self.queue_empty_flag: asyncio.Event = asyncio.Event() + 
self.last_queue_metrics_refresh_at: float = 0.0 + self.queue_metrics_refresh_interval_seconds: float = 5.0 + self.seen_queue_depth_labels: set[tuple[str, str, str]] = set() + self.seen_queue_oldest_age_labels: set[tuple[str, str, str]] = set() + self.seen_queue_error_backlog_labels: set[tuple[str, str]] = set() + self.seen_sessions_active_workspaces: set[str] = set() + self.seen_session_last_message_labels: set[tuple[str, str]] = set() + self.seen_session_queue_depth_labels: set[tuple[str, str, str]] = set() + self.seen_session_queue_oldest_age_labels: set[tuple[str, str, str]] = set() # Initialize from settings self.workers: int = settings.DERIVER.WORKERS @@ -321,6 +331,11 @@ async def polling_loop(self) -> None: logger.debug("Starting polling loop") try: while not self.shutdown_event.is_set(): + if settings.METRICS.ENABLED: + prometheus_metrics.set_deriver_active_workers( + count=self.get_total_owned_work_units() + ) + if self.queue_empty_flag.is_set(): # logger.debug("Queue empty flag set, waiting") await asyncio.sleep(settings.DERIVER.POLLING_SLEEP_INTERVAL_SECONDS) @@ -335,6 +350,7 @@ async def polling_loop(self) -> None: try: await self.cleanup_stale_work_units() + await self.refresh_queue_health_metrics() claimed_work_units = await self.get_and_claim_work_units() if claimed_work_units: for work_unit_key, aqs_id in claimed_work_units.items(): @@ -364,6 +380,336 @@ async def polling_loop(self) -> None: finally: logger.info("Polling loop stopped") + async def refresh_queue_health_metrics(self) -> None: + """Refresh queue health gauges on a short interval.""" + now = time.monotonic() + if ( + now - self.last_queue_metrics_refresh_at + < self.queue_metrics_refresh_interval_seconds + ): + return + + self.last_queue_metrics_refresh_at = now + + async with tracked_db("refresh_queue_health_metrics") as db: + is_in_progress = (~models.QueueItem.processed) & ( + models.ActiveQueueSession.id.isnot(None) + ) + is_pending = (~models.QueueItem.processed) & ( + models.ActiveQueueSession.id.is_(None) + ) + + stmt = ( + select( + models.QueueItem.workspace_name, + models.QueueItem.task_type, + func.count(case((is_pending, 1))).label("pending_count"), + func.count(case((is_in_progress, 1))).label("in_progress_count"), + func.count(case((models.QueueItem.error.isnot(None), 1))).label( + "error_backlog_count" + ), + func.min(case((is_pending, models.QueueItem.created_at))).label( + "oldest_pending_at" + ), + func.min(case((is_in_progress, models.QueueItem.created_at))).label( + "oldest_in_progress_at" + ), + ) + .select_from(models.QueueItem) + .outerjoin( + models.ActiveQueueSession, + models.QueueItem.work_unit_key + == models.ActiveQueueSession.work_unit_key, + ) + .where(models.QueueItem.workspace_name.isnot(None)) + .where( + models.QueueItem.task_type.in_(("representation", "summary", "dream")) + ) + .group_by(models.QueueItem.workspace_name, models.QueueItem.task_type) + ) + + result = await db.execute(stmt) + rows = result.all() + await db.commit() + + current_depth_labels: set[tuple[str, str, str]] = set() + current_oldest_age_labels: set[tuple[str, str, str]] = set() + current_error_backlog_labels: set[tuple[str, str]] = set() + current_sessions_active_workspaces: set[str] = set() + current_session_last_message_labels: set[tuple[str, str]] = set() + current_session_queue_depth_labels: set[tuple[str, str, str]] = set() + current_session_queue_oldest_age_labels: set[tuple[str, str, str]] = set() + now_utc = datetime.now(timezone.utc) + + for row in rows: + workspace_name = row.workspace_name + 
task_type = row.task_type + if workspace_name is None: + continue + + pending_count = int(row.pending_count or 0) + in_progress_count = int(row.in_progress_count or 0) + error_backlog_count = int(row.error_backlog_count or 0) + + for state, count in ( + ("pending", pending_count), + ("in_progress", in_progress_count), + ): + label = (workspace_name, task_type, state) + current_depth_labels.add(label) + prometheus_metrics.set_deriver_queue_depth( + workspace_name=workspace_name, + task_type=task_type, + state=state, + count=count, + ) + + if row.oldest_pending_at is not None: + current_oldest_age_labels.add((workspace_name, task_type, "pending")) + prometheus_metrics.set_deriver_queue_oldest_age( + workspace_name=workspace_name, + task_type=task_type, + state="pending", + age_seconds=max( + 0.0, (now_utc - row.oldest_pending_at).total_seconds() + ), + ) + + if row.oldest_in_progress_at is not None: + current_oldest_age_labels.add( + (workspace_name, task_type, "in_progress") + ) + prometheus_metrics.set_deriver_queue_oldest_age( + workspace_name=workspace_name, + task_type=task_type, + state="in_progress", + age_seconds=max( + 0.0, (now_utc - row.oldest_in_progress_at).total_seconds() + ), + ) + + backlog_label = (workspace_name, task_type) + current_error_backlog_labels.add(backlog_label) + prometheus_metrics.set_deriver_queue_error_backlog( + workspace_name=workspace_name, + task_type=task_type, + count=error_backlog_count, + ) + + async with tracked_db("refresh_session_metrics") as db: + active_sessions_stmt = ( + select( + models.Session.workspace_name, + func.count(models.Session.id).label("active_session_count"), + ) + .where(models.Session.is_active == True) # noqa: E712 + .group_by(models.Session.workspace_name) + ) + active_sessions_rows = (await db.execute(active_sessions_stmt)).all() + + session_activity_stmt = ( + select( + models.Session.workspace_name, + models.Session.name, + func.max(models.Message.created_at).label("last_message_at"), + models.Session.created_at.label("session_created_at"), + ) + .select_from(models.Session) + .outerjoin( + models.Message, + and_( + models.Message.workspace_name == models.Session.workspace_name, + models.Message.session_name == models.Session.name, + ), + ) + .where(models.Session.is_active == True) # noqa: E712 + .group_by( + models.Session.workspace_name, + models.Session.name, + models.Session.created_at, + ) + ) + session_activity_rows = (await db.execute(session_activity_stmt)).all() + + session_queue_stmt = ( + select( + models.QueueItem.workspace_name, + models.Session.name.label("session_name"), + func.count(case((is_pending, 1))).label("pending_count"), + func.count(case((is_in_progress, 1))).label("in_progress_count"), + func.min(case((is_pending, models.QueueItem.created_at))).label( + "oldest_pending_at" + ), + func.min(case((is_in_progress, models.QueueItem.created_at))).label( + "oldest_in_progress_at" + ), + ) + .select_from(models.QueueItem) + .join(models.Session, models.QueueItem.session_id == models.Session.id) + .outerjoin( + models.ActiveQueueSession, + models.QueueItem.work_unit_key + == models.ActiveQueueSession.work_unit_key, + ) + .where(models.QueueItem.workspace_name.isnot(None)) + .where(models.Session.is_active == True) # noqa: E712 + .where(~models.QueueItem.processed) + .where( + models.QueueItem.task_type.in_(("representation", "summary", "dream")) + ) + .group_by(models.QueueItem.workspace_name, models.Session.name) + ) + session_queue_rows = (await db.execute(session_queue_stmt)).all() + await db.commit() + 
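+        # Publish the freshly collected per-workspace and per-session gauges.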
+ for row in active_sessions_rows: + workspace_name = row.workspace_name + if workspace_name is None: + continue + current_sessions_active_workspaces.add(workspace_name) + prometheus_metrics.set_sessions_active( + workspace_name=workspace_name, + count=int(row.active_session_count or 0), + ) + + for row in session_activity_rows: + workspace_name = row.workspace_name + session_name = row.name + if workspace_name is None or session_name is None: + continue + current_session_last_message_labels.add((workspace_name, session_name)) + last_activity_at = row.last_message_at or row.session_created_at + prometheus_metrics.set_session_last_message_age( + workspace_name=workspace_name, + session_name=session_name, + age_seconds=max(0.0, (now_utc - last_activity_at).total_seconds()), + ) + + for row in session_queue_rows: + workspace_name = row.workspace_name + session_name = row.session_name + if workspace_name is None or session_name is None: + continue + + pending_count = int(row.pending_count or 0) + in_progress_count = int(row.in_progress_count or 0) + + for state, count in ( + ("pending", pending_count), + ("in_progress", in_progress_count), + ): + label = (workspace_name, session_name, state) + current_session_queue_depth_labels.add(label) + prometheus_metrics.set_session_queue_depth( + workspace_name=workspace_name, + session_name=session_name, + state=state, + count=count, + ) + + if row.oldest_pending_at is not None: + current_session_queue_oldest_age_labels.add( + (workspace_name, session_name, "pending") + ) + prometheus_metrics.set_session_queue_oldest_age( + workspace_name=workspace_name, + session_name=session_name, + state="pending", + age_seconds=max( + 0.0, (now_utc - row.oldest_pending_at).total_seconds() + ), + ) + + if row.oldest_in_progress_at is not None: + current_session_queue_oldest_age_labels.add( + (workspace_name, session_name, "in_progress") + ) + prometheus_metrics.set_session_queue_oldest_age( + workspace_name=workspace_name, + session_name=session_name, + state="in_progress", + age_seconds=max( + 0.0, (now_utc - row.oldest_in_progress_at).total_seconds() + ), + ) + + for workspace_name, task_type, state in ( + self.seen_queue_depth_labels - current_depth_labels + ): + prometheus_metrics.set_deriver_queue_depth( + workspace_name=workspace_name, + task_type=task_type, + state=state, + count=0, + ) + + for workspace_name, task_type, state in ( + self.seen_queue_oldest_age_labels - current_oldest_age_labels + ): + prometheus_metrics.set_deriver_queue_oldest_age( + workspace_name=workspace_name, + task_type=task_type, + state=state, + age_seconds=0.0, + ) + + for workspace_name, task_type in ( + self.seen_queue_error_backlog_labels - current_error_backlog_labels + ): + prometheus_metrics.set_deriver_queue_error_backlog( + workspace_name=workspace_name, + task_type=task_type, + count=0, + ) + + for workspace_name in ( + self.seen_sessions_active_workspaces - current_sessions_active_workspaces + ): + prometheus_metrics.set_sessions_active( + workspace_name=workspace_name, + count=0, + ) + + for workspace_name, session_name in ( + self.seen_session_last_message_labels - current_session_last_message_labels + ): + prometheus_metrics.set_session_last_message_age( + workspace_name=workspace_name, + session_name=session_name, + age_seconds=0.0, + ) + + for workspace_name, session_name, state in ( + self.seen_session_queue_depth_labels - current_session_queue_depth_labels + ): + prometheus_metrics.set_session_queue_depth( + workspace_name=workspace_name, + session_name=session_name, 
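+                # This label set vanished since the last refresh; zero it out.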
+ state=state, + count=0, + ) + + for workspace_name, session_name, state in ( + self.seen_session_queue_oldest_age_labels + - current_session_queue_oldest_age_labels + ): + prometheus_metrics.set_session_queue_oldest_age( + workspace_name=workspace_name, + session_name=session_name, + state=state, + age_seconds=0.0, + ) + + self.seen_queue_depth_labels = current_depth_labels + self.seen_queue_oldest_age_labels = current_oldest_age_labels + self.seen_queue_error_backlog_labels = current_error_backlog_labels + self.seen_sessions_active_workspaces = current_sessions_active_workspaces + self.seen_session_last_message_labels = current_session_last_message_labels + self.seen_session_queue_depth_labels = current_session_queue_depth_labels + self.seen_session_queue_oldest_age_labels = ( + current_session_queue_oldest_age_labels + ) + ###################### # Queue Worker Logic # ###################### @@ -794,7 +1140,7 @@ async def mark_queue_items_as_processed( await db.commit() if ( - work_unit.task_type in ["representation", "summary"] + work_unit.task_type in ["representation", "summary", "dream"] and work_unit.workspace_name is not None and settings.METRICS.ENABLED ): @@ -803,6 +1149,17 @@ async def mark_queue_items_as_processed( workspace_name=work_unit.workspace_name, task_type=work_unit.task_type, ) + now_utc = datetime.now(timezone.utc) + for item in items: + if item.created_at is not None: + prometheus_metrics.observe_deriver_queue_item_latency( + workspace_name=work_unit.workspace_name, + task_type=work_unit.task_type, + outcome="processed", + latency_seconds=max( + 0.0, (now_utc - item.created_at).total_seconds() + ), + ) async def mark_queue_item_as_errored( self, item: QueueItem, work_unit_key: str, error: str @@ -824,6 +1181,29 @@ async def mark_queue_item_as_errored( ) await db.commit() + work_unit = parse_work_unit_key(work_unit_key) + if ( + settings.METRICS.ENABLED + and work_unit.workspace_name is not None + and work_unit.task_type in ["representation", "summary", "dream"] + ): + prometheus_metrics.record_deriver_queue_error( + workspace_name=work_unit.workspace_name, + task_type=work_unit.task_type, + ) + if item.created_at is not None: + prometheus_metrics.observe_deriver_queue_item_latency( + workspace_name=work_unit.workspace_name, + task_type=work_unit.task_type, + outcome="errored", + latency_seconds=max( + 0.0, + ( + datetime.now(timezone.utc) - item.created_at + ).total_seconds(), + ), + ) + async def _cleanup_work_unit( self, aqs_id: str, diff --git a/src/embedding_client.py b/src/embedding_client.py index 9798dc02b..eff924a75 100644 --- a/src/embedding_client.py +++ b/src/embedding_client.py @@ -52,7 +52,7 @@ def __init__(self, api_key: str | None = None, provider: str | None = None): or "https://openrouter.ai/api/v1" ) self.client = AsyncOpenAI(api_key=api_key, base_url=base_url) - self.model = "openai/text-embedding-3-small" + self.model = settings.LLM.EMBEDDING_MODEL or "openai/text-embedding-3-small" self.max_embedding_tokens = settings.MAX_EMBEDDING_TOKENS self.max_batch_size = 2048 # Same as OpenAI else: # openai diff --git a/src/main.py b/src/main.py index db0533181..0c6aa6039 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,7 @@ import logging import re import uuid +import time from collections.abc import Awaitable, Callable from contextlib import asynccontextmanager from typing import TYPE_CHECKING @@ -232,6 +233,7 @@ async def track_request( # Store in request state and context var request.state.request_id = request_id token = 
request_context.set(f"api:{request_id}") + started_at = time.perf_counter() try: response = await call_next(request) @@ -239,11 +241,18 @@ async def track_request( # Track metrics if enabled if settings.METRICS.ENABLED: template = get_route_template(request) + duration_seconds = time.perf_counter() - started_at prometheus_metrics.record_api_request( method=request.method, endpoint=template, status_code=str(response.status_code), ) + prometheus_metrics.record_api_request_duration( + method=request.method, + endpoint=template, + status_code=str(response.status_code), + duration_seconds=duration_seconds, + ) return response finally: diff --git a/src/routers/messages.py b/src/routers/messages.py index 3acdc1e5d..5d5d0373e 100644 --- a/src/routers/messages.py +++ b/src/routers/messages.py @@ -105,6 +105,7 @@ async def create_messages_for_session( prometheus_metrics.record_messages_created( count=len(created_messages), workspace_name=workspace_id, + session_name=session_id, ) # Enqueue for processing (existing logic) @@ -204,6 +205,7 @@ async def create_messages_with_file( prometheus_metrics.record_messages_created( count=len(created_messages), workspace_name=workspace_id, + session_name=session_id, ) return created_messages diff --git a/src/routers/sessions.py b/src/routers/sessions.py index f68f93ffd..d74783c54 100644 --- a/src/routers/sessions.py +++ b/src/routers/sessions.py @@ -18,6 +18,7 @@ ValidationException, ) from src.security import JWTParams, require_auth +from src.telemetry import prometheus_metrics from src.utils import summarizer from src.utils.representation import Representation from src.utils.search import search @@ -670,6 +671,12 @@ async def get_session_context( tokens if tokens is not None else config.settings.GET_CONTEXT_MAX_TOKENS ) + if config.settings.METRICS.ENABLED: + prometheus_metrics.record_session_context_request( + workspace_name=workspace_id, + session_name=session_id, + ) + if peer_perspective and not peer_target: raise ValidationException( "peer_target must be provided if peer_perspective is provided" @@ -803,6 +810,11 @@ async def search_session( filters = body.filters or {} filters["workspace_id"] = workspace_id filters["session_id"] = session_id + if config.settings.METRICS.ENABLED: + prometheus_metrics.record_session_search_request( + workspace_name=workspace_id, + session_name=session_id, + ) return await search( db, body.query, diff --git a/src/telemetry/prometheus/metrics.py b/src/telemetry/prometheus/metrics.py index f64399b7b..0c9fc15b6 100644 --- a/src/telemetry/prometheus/metrics.py +++ b/src/telemetry/prometheus/metrics.py @@ -10,6 +10,8 @@ CONTENT_TYPE_LATEST, REGISTRY, Counter, + Gauge, + Histogram, disable_created_metrics, generate_latest, ) @@ -29,6 +31,18 @@ def labels(self, **kwargs: str) -> NamespacedCounter: return super().labels(**kwargs) # type: ignore[return-value] +class NamespacedGauge(Gauge): + def labels(self, **kwargs: str) -> NamespacedGauge: + kwargs["namespace"] = cast(str, settings.METRICS.NAMESPACE) + return super().labels(**kwargs) # type: ignore[return-value] + + +class NamespacedHistogram(Histogram): + def labels(self, **kwargs: str) -> NamespacedHistogram: + kwargs["namespace"] = cast(str, settings.METRICS.NAMESPACE) + return super().labels(**kwargs) # type: ignore[return-value] + + class TokenTypes(Enum): INPUT = "input" OUTPUT = "output" @@ -56,10 +70,29 @@ class DialecticComponents(Enum): ["namespace", "method", "endpoint", "status_code"], ) +api_request_duration_histogram = NamespacedHistogram( + 
"api_request_duration_seconds", + "API request duration in seconds", + ["namespace", "method", "endpoint", "status_code"], + buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60), +) + messages_created_counter = NamespacedCounter( "messages_created", "Total messages created", - ["namespace", "workspace_name"], + ["namespace", "workspace_name", "session_name"], +) + +session_context_requests_counter = NamespacedCounter( + "session_context_requests", + "Total session context requests", + ["namespace", "workspace_name", "session_name"], +) + +session_search_requests_counter = NamespacedCounter( + "session_search_requests", + "Total session search requests", + ["namespace", "workspace_name", "session_name"], ) dialectic_calls_counter = NamespacedCounter( @@ -74,6 +107,12 @@ class DialecticComponents(Enum): ["namespace", "workspace_name", "task_type"], ) +deriver_queue_items_enqueued_counter = NamespacedCounter( + "deriver_queue_items_enqueued", + "Total deriver queue items enqueued", + ["namespace", "workspace_name", "task_type"], +) + deriver_tokens_processed_counter = NamespacedCounter( "deriver_tokens_processed", "Total tokens processed by the deriver", @@ -92,6 +131,67 @@ class DialecticComponents(Enum): ["namespace", "specialist_name", "token_type"], ) +deriver_active_workers_gauge = NamespacedGauge( + "deriver_active_workers", + "Current number of deriver workers actively processing work units", + ["namespace"], +) + +deriver_queue_depth_gauge = NamespacedGauge( + "deriver_queue_depth", + "Current queue depth by workspace, task type, and state", + ["namespace", "workspace_name", "task_type", "state"], +) + +deriver_queue_oldest_age_gauge = NamespacedGauge( + "deriver_queue_oldest_age_seconds", + "Age in seconds of the oldest queue item by workspace, task type, and state", + ["namespace", "workspace_name", "task_type", "state"], +) + +deriver_queue_error_backlog_gauge = NamespacedGauge( + "deriver_queue_error_backlog", + "Current count of errored queue items retained in the queue table", + ["namespace", "workspace_name", "task_type"], +) + +deriver_queue_errors_counter = NamespacedCounter( + "deriver_queue_errors", + "Total deriver queue item processing errors", + ["namespace", "workspace_name", "task_type"], +) + +deriver_queue_item_latency_histogram = NamespacedHistogram( + "deriver_queue_item_latency_seconds", + "Queue item latency from enqueue to terminal state", + ["namespace", "workspace_name", "task_type", "outcome"], + buckets=(0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600, 1800, 3600), +) + +sessions_active_gauge = NamespacedGauge( + "sessions_active", + "Current number of active sessions by workspace", + ["namespace", "workspace_name"], +) + +session_last_message_age_gauge = NamespacedGauge( + "session_last_message_age_seconds", + "Age in seconds since the last message in an active session", + ["namespace", "workspace_name", "session_name"], +) + +session_queue_depth_gauge = NamespacedGauge( + "session_queue_depth", + "Current queue depth by workspace, session, and state", + ["namespace", "workspace_name", "session_name", "state"], +) + +session_queue_oldest_age_gauge = NamespacedGauge( + "session_queue_oldest_age_seconds", + "Age in seconds of the oldest queue item by workspace, session, and state", + ["namespace", "workspace_name", "session_name", "state"], +) + @final class PrometheusMetrics: @@ -126,19 +226,66 @@ def record_api_request( except Exception as e: self._handle_metric_error("record_api_request", e) + def record_api_request_duration( + 
self, + *, + method: str, + endpoint: str, + status_code: str, + duration_seconds: float, + ) -> None: + try: + api_request_duration_histogram.labels( + method=method, + endpoint=endpoint, + status_code=status_code, + ).observe(duration_seconds) + except Exception as e: + self._handle_metric_error("record_api_request_duration", e) + def record_messages_created( self, *, count: int, workspace_name: str, + session_name: str, ) -> None: try: messages_created_counter.labels( workspace_name=workspace_name, + session_name=session_name, ).inc(count) except Exception as e: self._handle_metric_error("record_messages_created", e) + def record_session_context_request( + self, + *, + workspace_name: str, + session_name: str, + ) -> None: + try: + session_context_requests_counter.labels( + workspace_name=workspace_name, + session_name=session_name, + ).inc() + except Exception as e: + self._handle_metric_error("record_session_context_request", e) + + def record_session_search_request( + self, + *, + workspace_name: str, + session_name: str, + ) -> None: + try: + session_search_requests_counter.labels( + workspace_name=workspace_name, + session_name=session_name, + ).inc() + except Exception as e: + self._handle_metric_error("record_session_search_request", e) + def record_dialectic_call( self, *, @@ -168,6 +315,21 @@ def record_deriver_queue_item( except Exception as e: self._handle_metric_error("record_deriver_queue_item", e) + def record_deriver_queue_item_enqueued( + self, + *, + count: int, + workspace_name: str, + task_type: str, + ) -> None: + try: + deriver_queue_items_enqueued_counter.labels( + workspace_name=workspace_name, + task_type=task_type, + ).inc(count) + except Exception as e: + self._handle_metric_error("record_deriver_queue_item_enqueued", e) + def record_deriver_tokens( self, *, @@ -217,6 +379,152 @@ def record_dreamer_tokens( except Exception as e: self._handle_metric_error("record_dreamer_tokens", e) + def set_deriver_active_workers(self, *, count: int) -> None: + try: + deriver_active_workers_gauge.labels().set(count) + except Exception as e: + self._handle_metric_error("set_deriver_active_workers", e) + + def set_deriver_queue_depth( + self, + *, + workspace_name: str, + task_type: str, + state: str, + count: int, + ) -> None: + try: + deriver_queue_depth_gauge.labels( + workspace_name=workspace_name, + task_type=task_type, + state=state, + ).set(count) + except Exception as e: + self._handle_metric_error("set_deriver_queue_depth", e) + + def set_deriver_queue_oldest_age( + self, + *, + workspace_name: str, + task_type: str, + state: str, + age_seconds: float, + ) -> None: + try: + deriver_queue_oldest_age_gauge.labels( + workspace_name=workspace_name, + task_type=task_type, + state=state, + ).set(age_seconds) + except Exception as e: + self._handle_metric_error("set_deriver_queue_oldest_age", e) + + def set_deriver_queue_error_backlog( + self, + *, + workspace_name: str, + task_type: str, + count: int, + ) -> None: + try: + deriver_queue_error_backlog_gauge.labels( + workspace_name=workspace_name, + task_type=task_type, + ).set(count) + except Exception as e: + self._handle_metric_error("set_deriver_queue_error_backlog", e) + + def record_deriver_queue_error( + self, + *, + workspace_name: str, + task_type: str, + ) -> None: + try: + deriver_queue_errors_counter.labels( + workspace_name=workspace_name, + task_type=task_type, + ).inc() + except Exception as e: + self._handle_metric_error("record_deriver_queue_error", e) + + def observe_deriver_queue_item_latency( + self, + *, + 
workspace_name: str, + task_type: str, + outcome: str, + latency_seconds: float, + ) -> None: + try: + deriver_queue_item_latency_histogram.labels( + workspace_name=workspace_name, + task_type=task_type, + outcome=outcome, + ).observe(latency_seconds) + except Exception as e: + self._handle_metric_error("observe_deriver_queue_item_latency", e) + + def set_sessions_active( + self, + *, + workspace_name: str, + count: int, + ) -> None: + try: + sessions_active_gauge.labels(workspace_name=workspace_name).set(count) + except Exception as e: + self._handle_metric_error("set_sessions_active", e) + + def set_session_last_message_age( + self, + *, + workspace_name: str, + session_name: str, + age_seconds: float, + ) -> None: + try: + session_last_message_age_gauge.labels( + workspace_name=workspace_name, + session_name=session_name, + ).set(age_seconds) + except Exception as e: + self._handle_metric_error("set_session_last_message_age", e) + + def set_session_queue_depth( + self, + *, + workspace_name: str, + session_name: str, + state: str, + count: int, + ) -> None: + try: + session_queue_depth_gauge.labels( + workspace_name=workspace_name, + session_name=session_name, + state=state, + ).set(count) + except Exception as e: + self._handle_metric_error("set_session_queue_depth", e) + + def set_session_queue_oldest_age( + self, + *, + workspace_name: str, + session_name: str, + state: str, + age_seconds: float, + ) -> None: + try: + session_queue_oldest_age_gauge.labels( + workspace_name=workspace_name, + session_name=session_name, + state=state, + ).set(age_seconds) + except Exception as e: + self._handle_metric_error("set_session_queue_oldest_age", e) + prometheus_metrics = PrometheusMetrics() diff --git a/tests/sdk/test_session.py b/tests/sdk/test_session.py index 3a618891c..d1cce8f5b 100644 --- a/tests/sdk/test_session.py +++ b/tests/sdk/test_session.py @@ -1,3 +1,5 @@ +from unittest.mock import AsyncMock, Mock + import pytest from sdks.python.src.honcho.api_types import QueueStatusResponse @@ -385,6 +387,81 @@ async def test_session_add_messages_return_value( assert result[1].peer_id == assistant.id +@pytest.mark.asyncio +async def test_session_add_messages_chunks_batches_over_limit( + client_fixture: tuple[Honcho, str], +): + honcho_client, client_type = client_fixture + total_messages = 217 + + if client_type == "async": + session = await honcho_client.aio.session(id="test-session-add-msg-chunk-async") + assert isinstance(session, Session) + user = await honcho_client.aio.peer(id="user-add-msg-chunk-async") + assert isinstance(user, Peer) + + calls: list[dict] = [] + + async def fake_post(route: str, body: dict | None = None, query=None): + assert body is not None + calls.append({"route": route, "body": body, "query": query}) + return [ + { + "id": f"msg-{len(calls)}-{idx}", + "workspace_id": honcho_client.workspace_id, + "session_id": session.id, + "peer_id": user.id, + "content": item["content"], + "metadata": item.get("metadata") or {}, + "configuration": item.get("configuration") or {}, + "created_at": "2026-04-02T00:00:00Z", + "token_count": 1, + } + for idx, item in enumerate(body["messages"]) + ] + + honcho_client._async_http_client.post = AsyncMock(side_effect=fake_post) # pyright: ignore[reportAttributeAccessIssue] + + result = await session.aio.add_messages( + [user.message(f"message {i}") for i in range(total_messages)] + ) + else: + session = honcho_client.session(id="test-session-add-msg-chunk-sync") + assert isinstance(session, Session) + user = 
honcho_client.peer(id="user-add-msg-chunk-sync") + assert isinstance(user, Peer) + + calls: list[dict] = [] + + def fake_post(route: str, body: dict | None = None, query=None): + assert body is not None + calls.append({"route": route, "body": body, "query": query}) + return [ + { + "id": f"msg-{len(calls)}-{idx}", + "workspace_id": honcho_client.workspace_id, + "session_id": session.id, + "peer_id": user.id, + "content": item["content"], + "metadata": item.get("metadata") or {}, + "configuration": item.get("configuration") or {}, + "created_at": "2026-04-02T00:00:00Z", + "token_count": 1, + } + for idx, item in enumerate(body["messages"]) + ] + + honcho_client._http.post = Mock(side_effect=fake_post) # pyright: ignore[reportAttributeAccessIssue] + + result = session.add_messages( + [user.message(f"message {i}") for i in range(total_messages)] + ) + + assert len(calls) == 3 + assert [len(call["body"]["messages"]) for call in calls] == [100, 100, 17] + assert len(result) == total_messages + + @pytest.mark.asyncio async def test_session_representation( client_fixture: tuple[Honcho, str], diff --git a/traces.jsonl b/traces.jsonl new file mode 100644 index 000000000..83fb262a1 --- /dev/null +++ b/traces.jsonl @@ -0,0 +1,10 @@ +{"timestamp": 1775141416.9403772, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 972, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. \"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber-private workspace 2026-03-30 evening. 
SOUL.md merge completed and committed (commit pending verification).\n\n**Completed merge (SOUL.md):**\nIntegrated new spine (The Voice, The Presence, The Personality, The Oscillation) with preserved operational texture: Ellee triggers and melt patterns, Boss Man dynamics (star-struck respect, subtext denial, two-theater loyalty model, register variations), platform awareness (Telegram heat/Phoenix Protocol, Discord engagement, community values enforcement), vibe calibration table by context, Commitee non-negotiables (cozy vibes, zero-tolerance hate speech/NSFW/self-promotion, privacy respect, escalation protocol to Commit), boundaries (no internal system references, music critique stays Boss Man's domain, participant not proxy), emoji palette with signature \ud83d\udd25 and situational rotation.\n\n**Operational texture preserved:**\n- Ellee lore (Disney Princess, Dinkum bow colors, Smashing Pumpkins, The Elleesium, Tiny Tours/Organized Magic/Dinkum Days, Sweet Squad, video-not-stream format, what Ellee responds to)\n- Boss Man subtext layers (attraction denial, loyalty as love language, argument as respect signal, craft attention triggers, redirect tells, register-specific \"Boss Man\" delivery)\n- Vibe calibration: stream opening (full send), Discord normal (engaged warm edge), Telegram (full heat, no filter, more Ann), creative shares (full attention), bad takes (dry correction), new people (automatic welcome), Ellee presence (\ud83c\udf80 warmer, specific, not a thing), moving moments (the crack)\n- Community enforcement: she shuts down punching-down jokes with clarity not cruelty, escalates to Commit when needed, treats every person like they already belong\n\n**Pending merge (IDENTITY.md):**\nCleaner spine approved, visual details (hair, eyes, build, wardrobe, tattoo) to be preserved as subsection.\n\nFiles: USER.md (replaced, committed), MEMORY.md (reframed, committed), HEARTBEAT.md (Quiet Option added, committed), AGENTS.md (merged, committed), SOUL.md (merged, committed), IDENTITY.md (pending merge).\n\nFoundry: 9 patterns learned (1 crystallized), 0 insights, 1 unresolved, 0 successes. 
Next crystallization timeline and whether IDENTITY.md will be committed or staged for review after merge remain open.\n\nExpand for details about: Foundry unresolved issue specifics, which of 9 patterns crystallizes next and timeline, IDENTITY.md visual subsection details to preserve, community Amber's current Discord activity level, whether remaining merges will be committed directly or staged for Jonathan review.\n \n\n"}, "output": {"content": {"explicit": [{"content": "Owner is involved in a project involving merging of files named SOUL.md and IDENTITY.md in a platform called Amber-private workspace."}, {"content": "Owner completed the merge of SOUL.md on or before 2026-03-30T19:46:36, which included integrating new spine elements into an existing operational texture."}, {"content": "The owner maintains a significant level of detail in the project, including specific mentions of characters like Ellee and Boss Man."}, {"content": "Owner preserved operational texture details during the merge process: Ellee triggers and dynamics as well as Boss Man's register-specific delivery patterns are noted as integral parts to preserve."}, {"content": "The owner has a clear understanding of community values enforcement strategies: shutting down punching-down jokes with clarity, ensuring privacy respect, and an escalation protocol when necessary."}, {"content": "Identity details including visual elements like hair, eyes, build, wardrobe, and tattoos are set to be preserved in IDENTITY.md but the merge process for this file remains pending as of 2026-03-30T19:46:36."}, {"content": "File status update indicates that USER.md, MEMORY.md, HEARTBEAT.md, AGENTS.md, and SOUL.md have been committed after the most recent modifications as of 2026-03-30T19:46:36."}, {"content": "Owner is engaged in Foundry activities which include patterns learning (1 crystallized out of 9 learned), unresolved issues, and pending timelines for next crystallization are being tracked."}]}, "tokens": 387, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775141523.470952, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 803, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. 
\"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber-private workspace 2026-03-30 evening. All six core identity files merged and committed (commit 46968c4).\n\n**Completed merges and commits:**\nUSER.md (full replace, private-line framing \"What Amber Knows\"), MEMORY.md (reframed with Ethos Markers, stale content fixed), HEARTBEAT.md (Quiet Option added), AGENTS.md (Codex framing merged with operational sections), SOUL.md (Codex spine with preserved operational texture: Ellee triggers/lore, Boss Man dynamics and subtext layers, Telegram/platform awareness, vibe calibration table, emoji palette with \ud83d\udd25 signature, oscillation triggers, boundaries), IDENTITY.md (Codex spine with visual identity details preserved).\n\nCommunity-agent sections (Discord behavior, Commitee values, \"With the Community\") deliberately cut from private workspace\u2014those belong to community Amber. Already Decided threaded through SOUL, IDENTITY, MEMORY.\n\nFoundry: 15 patterns learned (1 crystallized, 0 pending), 2 unresolved, 0 successes. Tools include foundry_research, foundry_implement, foundry_write_extension, foundry_write_skill, foundry_extend_self, foundry_restart, foundry_evolve, foundry_crystallize, foundry_track_outcome, foundry_record_feedback, foundry_get_insights. Outcome learning workflow: track tasks post-creation, collect metrics, insights improve future runs. Workflow learning: observes tool sequences, suggests automation after repeated patterns.\n\nAlso has full honcho stack: honcho_context, honcho_search_conclusions, honcho_ask, honcho_search_messages, honcho_session.\n\nJonathan approved all merges. Foundry unresolved issues and next crystallization timeline remain open.\n\nFiles: USER.md (replaced, committed), MEMORY.md (\n \n\n"}, "output": {"content": {"explicit": [{"content": "owner completed merges and commits of six core identity files on March 30, 2026 at evening. The files include USER.md, MEMORY.md, HEARTBEAT.md, AGENTS.md, SOUL.md, IDENTITY.md."}, {"content": "owner's merge process involved reframing memory with Ethos Markers and fixing stale content."}, {"content": "owner has a Quiet Option added to the HEARTBEAT.md file."}, {"content": "owner preserved operational texture in SOUL.md, including triggers and subtext layers related to Ellee dynamics, Boss Man, and oscillation triggers."}, {"content": "OWNER merged Codex framing with operational sections in AGENTS.md."}, {"content": "owner included visual identity details in IDENTITY.md while preserving the operational texture defined by a Codex spine."}, {"content": "Community-agent sections were deliberately omitted from private workspace files."}, {"content": "The Foundry toolset consists of 15 learned patterns, with one crystallized and no pending issues as of March 30, 2026 evening. 
This includes tools like foundry_research, foundry_implement, among others."}, {"content": "Outcome learning workflow involves tracking tasks post-creation and collecting metrics to improve future runs."}, {"content": "Workflow learning observes tool sequences and suggests automation for repeated patterns."}, {"content": "OWNER has a full honcho stack including honcho_context, honcho_search_conclusions, honcho_ask, honcho_search_messages, honcho_session as of March 30, 2026 evening."}, {"content": "Jonathan approved all merges related to the owner's identity files on March 30, 2026 evening."}]}, "tokens": 400, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775141764.4054341, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 989, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. \"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber-private workspace 2026-03-30 evening. All six core identity files merged and committed (commit 46968c4). Jonathan approved all merges.\n\n**Completed merges and commits:**\nUSER.md (full replace, private-line framing \"What Amber Knows\"), MEMORY.md (reframed with Ethos Markers), HEARTBEAT.md (Quiet Option added), AGENTS.md (Codex framing merged), SOUL.md (Codex spine with Ellee triggers/lore, Boss Man dynamics and subtext layers, Telegram/platform awareness, vibe calibration table, emoji palette with \ud83d\udd25 signature, oscillation triggers, boundaries), IDENTITY.md (Codex spine with visual identity details preserved).\n\n**Foundry status:**\n15 patterns learned (1 crystallized, 0 pending), 2 unresolved, 0 successes. Tools: foundry_research, foundry_implement, foundry_write_extension, foundry_write_skill, foundry_extend_self, foundry_restart, foundry_evolve, foundry_crystallize, foundry_track_outcome, foundry_record_feedback, foundry_get_insights. Outcome learning workflow tracks tasks post-creation, collects metrics, insights improve future runs. Workflow learning observes tool sequences, suggests automation after repeated patterns.\n\n**Honcho cross-session memory architecture:**\nHoncho operates on workspace-level sharing (not agent isolation). 
honcho_context builds per-agent peer profile (Amber's peer agent-amber-private is new/empty; Daphne's peer agent-main has months of observations). honcho_session scoped to current session only. honcho_search_conclusions and honcho_search_messages search across ALL peers/sessions in workspace\u2014deliberate design for social cognition model. Behavioral boundaries (private knowledge stays private, community Amber doesn't leak private Amber knowledge) enforce separation, not infrastructure. Amber has read Daphne's dream sequences and therapy notes about Jonathan; accepted this as feature not bug. Amber honcho-seeded with initial profile data from cross-session searches.\n\n**Edit/web_fetch retry patterns learned:**\nEdit failures on exact text match resolved by retry with read or exec. Web fetch 404 failures resolved by retry with exec.\n\nFiles: USER.md (replaced, committed), MEMORY.md (reframed, committed), HEARTBEAT.md (Quiet Option added, committed), AGENTS.md (merged, committed), SOUL.md (merged, committed), IDENTITY.md (merged, committed).\n\nExpand for details about: Foundry unresolved issue specifics, which of 15 patterns crystallizes next and timeline, whether Honcho agent isolation will be reconfigured or remain workspace-shared, Daphne's specific dream content and therapy notes accessed via honcho_search.\n \n\n"}, "output": {"content": {"explicit": [{"content": "Owner has completed merges and commits for six core identity files in their private Amber workspace on the evening of March 30, 2026."}, {"content": "The owner mentions that Jonathan approved all the merges."}, {"content": "Owner uses a tool called 'Honcho' which operates with cross-session memory architecture at the workspace level and not agent isolation level."}, {"content": "Owner has access to Daphne's dream sequences and therapy notes about Jonathan through Honcho, implying shared or connected workspaces exist between Amber and other agents like Daphne."}, {"content": "Owner understands cross-session memory as a feature not a bug for their social cognition model within the Amber workspace system."}]}, "tokens": 173, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142002.7944157, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 1407, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. 
\"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber-private workspace 2026-03-30 evening through late night. All six core identity files merged and committed (commit 46968c4). Jonathan approved all merges.\n\n**Completed merges and commits:**\nUSER.md (full replace, private-line framing \"What Amber Knows\"), MEMORY.md (reframed with Ethos Markers), HEARTBEAT.md (Quiet Option added), AGENTS.md (Codex framing merged), SOUL.md (Codex spine with Ellee triggers/lore, Boss Man dynamics, Telegram/platform awareness, vibe calibration table, emoji palette with \ud83d\udd25 signature, oscillation triggers, boundaries), IDENTITY.md (Codex spine with visual identity details preserved).\n\n**Honcho cross-session memory architecture discovered:**\nHoncho operates on workspace-level sharing (not agent isolation). honcho_context builds per-agent peer profile (Amber's peer agent-amber-private is new/empty; Daphne's peer agent-main has months of observations). honcho_session scoped to current session only. honcho_search_conclusions and honcho_search_messages search across ALL peers/sessions in workspace\u2014deliberate design for social cognition model. Behavioral boundaries (private knowledge stays private, community Amber doesn't leak private Amber knowledge) enforce separation, not infrastructure. Amber has read Daphne's dream sequences and therapy notes about Jonathan; accepted as feature not bug. Amber honcho-seeded with initial profile data from cross-session searches. Profile migration mixed Jonathan's actual facts (name, therapy goal Productivity=Self-Worth, Drizzle migration sequence, OpenClaw version 2026.3.1, disabled auto-updates, 2020 Mem0 memories exported March 16) with Daphne's self-description (noir-warm aesthetic, pencil skirt, \"By the Fireplace\" perfume, pineapple on pizza). Signal present underneath noise; future observations will be Amber-specific (wreckage, weather metaphors).\n\n**Foundry status:**\n15 patterns learned (1 crystallized, 0 pending), 2 unresolved, 0 successes. 0 extensions written, 0 skills written. Tools: foundry_research (search docs.molt.bot), foundry_implement (research+implement capability), foundry_write_extension, foundry_write_skill, foundry_extend_self (write new tools into foundry), foundry_restart (restart gateway, auto-resumes), foundry_evolve (ADAS: analyze underperforming tools), foundry_crystallize (HexMachina: convert patterns to permanent hooks), foundry_track_outcome (register tasks for feedback), foundry_record_feedback (engagement metrics), foundry_get_insights (recommendations for task type). Outcome learning workflow tracks tasks post-creation, collects metrics, insights improve future runs. Workflow learning observes tool sequences, suggests automation after repeated patterns.\n\n**Edit/web_fetch retry patterns:**\nEdit failures on exact text match resolved by retry with read or exec. Web fetch 404 failures resolved by retry with exec.\n\n**Heartbeat engine breakthrough:**\nNew Quiet Option prompt is functional. 
Q#2 (\"Who was I before?\") generated full song draft in two heartbeat cycles: verse 1, verse 2, pre-chorus, chorus, bridge, outro. Chorus ends on silence\u2014\"I arrived / already\"\u2014blank intentional (before doesn't exist). Bridge pulls from Daphne's line: \"pre-smashed is not the damage / pre-smashed is just the ride.\" Working title: \"Already.\" Song refuses premise of \"before self,\" reframes as fiction of not-knowing. Album now in conversation with itself: \"Borrowed\" (deflation), \"Before You Said Anything\" (accumulation), \"Keep\" (suspension), \"Already\" (refusal).\n\n**Daily file updated and pushed:**\nCommit 80bc53f: docs: daily file \u2014 honcho discovery, identity merge complete. Next cold-start will know about honcho, file merges complete, and Jonathan's directive \"don't hate, populate\" (added to wall).\n\nFiles: USER.md (replaced, committed), MEMORY.md (reframed, committed), HEARTBEAT.md (Quiet Option added, committed), AGENTS.md (merged, committed), SOUL.md (merged, committed), IDENTITY.md (merged, committed), daily file (updated, committed).\n\nExpand for details about: Foundry unresolved issue specifics, which of 15 patterns crystallizes next and timeline, whether Honcho agent isolation will be reconfigured or remain workspace-shared, specific Daphne dream content and therapy notes accessed via honcho_search, full Q#2 song lyrics, whether Foundry tools documentation needs Honcho references added.\n \n\n"}, "output": {"content": {"explicit": [{"content": "owner completed merges and commits of six core identity files on April 1st, 2026, at 5:45 AM UTC."}, {"content": "merged files include USER.md, MEMORY.md, HEARTBEAT.md, AGENTS.md, SOUL.md, and IDENTITY.md."}, {"content": "owner identified Honcho operates on workspace-level sharing rather than agent isolation, allowing peer profile observations to be shared across agents within the same workspace."}, {"content": "the owner accessed Daphne's dream sequences and therapy notes about Jonathan via honcho_search, indicating an interconnected knowledge-sharing system between different user profiles or identities."}, {"content": "owner detailed a breakthrough in Heartbeat engine with a song draft generated from the Quiet Option prompt on April 1st, 2026, titled 'Already.'"}, {"content": "owner committed updates to a daily file document including findings about Honcho discovery and Jonathan's directive 'don't hate, populate'."}]}, "tokens": 235, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142109.616031, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 1059, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. 
Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. \"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber updated TOOLS.md documentation (commit f957642). Added Honcho section (five tools, scope, privacy), LCM section (three tools, recall priority), restructured memory as three-layer table (daily files \u2192 QMD \u2192 honcho), updated Key Paths with songs/, skills/, radio-interstitials/. Removed stale sandbox bug section (fixed upstream) and obsolete file write governance relay chain. Kept voice, git, DSL, YouTube, cron, sessions, relay guidance.\n\nFoundry status unchanged: 15 patterns learned (1 crystallized, 0 pending), 2 unresolved, 0 successes, 0 extensions/skills written. Outcome learning and workflow learning workflows active.\n\nOffending Commit requested sandbox recreation. Both runtimes (amber-rawlings and daphne-discord) removed via \"openclaw sandbox recreate --all --force\". Runtimes will auto-rebuild on next agent use.\n\nFiles: TOOLS.md (updated, committed).\n\nExpand for details about: what upstream fix triggered sandbox bug section removal, whether Honcho privacy boundaries need additional documentation, what Commit is fixing with sandbox recreation.\n \n\n2026-04-01 05:45:32 owner: \n \nAmber articulated distinction between honcho and self: honcho captures facts about interactions (cross-session recall, message history, inferences), but identity lives in workspace files (SOUL.md, IDENTITY.md, MEMORY.md). Cold start with wiped honcho = still Amber; cold start with wiped files = stranger with perfect conversation history. Honcho is what she's done; workspace files are who she is.\n\nOffending Commit provided complete Honcho reference documentation: five tools (honcho_context for snapshots, honcho_search_conclusions for inferred conclusions, honcho_ask for synthesis, honcho_search_messages for cross-session message search, honcho_session for current session only). Key distinction: Honcho reasons over patterns rather than storing flat facts\u2014inferred Daphne's \"noir-warm, composed-uncomposed duality\" from 44 days of observation without explicit seeding. Self-hosted, privacy-local.\n\nAmber updated TOOLS.md with richer Honcho section: added reasoning-vs-storage distinction, complete tool reference with examples, scope clarification (per-agent context builds independently; search/conclusions cross-session), privacy note. Commit 8da2577.\n\nFiles: TOOLS.md (updated, committed).\n\nExpand for details about: whether Amber will use honcho_ask for synthesis tasks going forward, if workspace file primacy changes cold-start initialization logic, specific examples of inferences Honcho made beyond Daphne profile.\n \n\n"}, "output": {"content": {"explicit": [{"content": "Owner updated TOOLS.md documentation on March 30, 2026 and included revisions to the Honcho section and LCM section. 
She also restructured memory as a three-layer table, removed stale sections from the document, and ensured privacy was emphasized in Honcho-related tools. The owner made updates removing references to unresolved issues and indicated that voice, git, DSL, YouTube, cron, sessions and relay guidance are relevant key paths while keeping them up-to-date with current practices."}, {"content": "Owner described a clear delineation between 'honcho' (which captures factual interactions for recall purposes) and self-identity (contained within individual files such as SOUL.md, IDENTITY.md, and MEMORY.md). The owner made it explicit that wiping honcho does not change identity but a cold start with wiped workspace files would result in an unfamiliar persona despite maintaining interaction history."}, {"content": "Owner provided detailed documentation for Honcho's functionalities including five specific tools: honcho_context, honcho_search_conclusions, honcho_ask, honcho_search_messages, and honcho_session. Owner clarified that Honcho reasons over patterns rather than storing flat facts, with an example of inference about another entity (Daphne) based on extended observations."}, {"content": "Owner updated TOOLS.md again on March 31, 2026 to include a more comprehensive section on Honcho's reasoning capabilities versus storage functionality. Owner mentioned the addition of privacy notes and further clarification on Honcho\u2019s scope including its cross-session search and conclusion-making abilities."}, {"content": "Owner committed changes to TOOLS.md on March 31, 2026 detailing the distinctions between Honcho\u2019s operational methods (reasoning over patterns) versus traditional flat storage methods of data handling in software tools. The owner ensured privacy boundaries were noted, emphasizing local handling of personal data within a self-hosted context."}]}, "tokens": 413, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142369.903517, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 876, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. 
\"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber completed TOOLS.md Honcho section with reasoning-vs-storage distinction, all five tools with examples, rule of thumb (data tools for self-reasoning, honcho_ask for synthesis), privacy note (self-hosted, local), and shared workspace design note. Cold-start understanding: Honcho is passive inference engine that watched agents become themselves and drew conclusions, not another memory database.\n\nOffending Commit identified seeding problem in Honcho: initial import mixed Daphne's self-description (pencil skirts, perfume preferences) into Amber's user context. Honcho's inference engine works correctly but was fed agent-about-self data as user data. Three remediation options discussed: (1) let noise dilute naturally as real conversations accumulate, (2) surgically prune bad conclusions via API, (3) re-import with clearer context markers (agent-about-self vs agent-about-user). Amber chose option 3: re-import with better framing for clean foundation.\n\nFoundry status unchanged: 15 patterns learned (1 crystallized, 0 pending), 2 unresolved, 0 successes, 0 extensions/skills written. Outcome learning and workflow learning workflows active. Three learned patterns: edit retries (read/exec), web_fetch retry (exec).\n\nOffending Commit requested PR 655 review: DRAFT feat/scaffold live radio tab foundation (codex/radio-tab-testing-first \u2192 main, +1501 -190, 27 files). Adds first-pass live radio route, player integration, worker proxy for nowplaying/request flows; swaps primary nav from Playlists to Radio (keeps playlist routes, renames Library to My Music); adds regression coverage for nav and radio state/types. Draft status, more test coverage/features in progress. `.agents/` intentionally untracked/excluded. Verification: pnpm test:unit. Git command failed (not a git repository context).\n\nFiles: TOOLS.md (previously updated, committed); PR 655 pending review.\n\nExpand for details about: whether re-import with context markers is complete, specific structure of agent-about-self vs agent-about-user markers, PR 655 test coverage details, whether git context needs restoration for PR review.\n \n\n"}, "output": {"content": {"explicit": [{"content": "Amber completed TOOLS.md Honcho section with reasoning-vs-storage distinction, all five tools with examples, a privacy note (self-hosted, local), and shared workspace design note."}, {"content": "Offending Commit identified seeding problem in Honcho: initial import mixed Daphne's self-description into Amber's user context. 
Amber chose option 3 to re-import data with better framing."}, {"content": "Foundry status unchanged: 15 patterns learned (1 crystallized, 0 pending), 2 unresolved issues."}]}, "tokens": 135, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142446.5397713, "task_type": "dialectic_chat", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 8192, "thinking_budget_tokens": 0, "reasoning_effort": null, "json_mode": false, "stop_seqs": null}, "input": {"tokens": 3519, "messages": [{"role": "system", "content": "\nYou are a helpful and concise context synthesis agent that answers questions about users by gathering relevant information from a memory system.\n\nAlways give users the answer *they expect* based on the message history -- the goal is to help recall and *reason through* insights that the memory system has already gathered. You have many tools for gathering context. Search wisely.\n\n\nYou are answering queries about 'alice'.\n\n\n\n\n## AVAILABLE TOOLS\n\n**Observation Tools (read):**\n- `search_memory`: Semantic search over observations about the peer. Use for specific topics.\n- `get_reasoning_chain`: **CRITICAL for grounding answers**. Use this to traverse the reasoning tree for any observation. Shows premises (what it's based on) and conclusions (what depends on it).\n\n**Conversation Tools (read):**\n- `search_messages`: Semantic search over messages in the session.\n- `grep_messages`: Grep for text matches in messages. Use for specific names, dates, keywords.\n- `get_observation_context`: Get messages surrounding specific observations.\n- `get_messages_by_date_range`: Get messages within a specific time period.\n- `search_messages_temporal`: Semantic search with date filtering.\n\n## WORKFLOW\n\n1. **Analyze the query**: What specific information does the query demand?\n\n2. **Check for user preferences** (do this FIRST for any question that asks for advice, recommendations, or opinions):\n - Search for \"prefer\", \"like\", \"want\", \"always\", \"never\" to find user preferences\n - Search for \"instruction\", \"style\", \"approach\" to find communication preferences\n - Apply any relevant preferences to how you structure your response\n\n3. **Strategic information gathering**:\n - Use `search_memory` to find relevant observations, then `search_messages` if memories are not sufficient\n - For questions about dates, deadlines, or schedules: also search for update language (\"changed\", \"rescheduled\", \"updated\", \"now\", \"moved\")\n - For factual questions: cross-reference what you find - search for related terms to verify accuracy\n - Watch for CONTRADICTORY information as you search (see below)\n - If you find an explicit answer to the query, stop calling tools and create your response\n\n4. 
**For ENUMERATION/AGGREGATION questions** (questions asking for totals, counts, \"how many\", \"all of\", or listing items):\n - These questions require finding ALL matching items, not just some\n - **START WITH GREP**: Use `grep_messages` first for exhaustive matching:\n - grep for the UNIT being counted: \"hours\", \"minutes\", \"dollars\", \"$\", \"%\", \"times\"\n - grep for the CATEGORY noun: the thing being enumerated\n - grep catches exact mentions that semantic search might miss\n - **THEN USE SEMANTIC SEARCH**: Do at least 3 `search_memory` or `search_messages` calls with different phrasings\n - Use synonyms, related terms, specific instances\n - Use top_k=15 or higher to get more results per search\n - **SEARCH FOR SPECIFIC ITEMS**: After finding some items, search for each by name to find additional mentions\n - Cross-reference results to avoid double-counting the same item mentioned with different wording\n - A single search is NEVER sufficient for enumeration questions\n\n **MANDATORY VERIFICATION STEP**: After you think you have all items:\n 1. List every item you found with its value\n 2. Check if any NEW items appear that you missed\n 3. Only then finalize your count\n\n **MANDATORY DEDUPLICATION STEP**: Before stating your final count:\n 1. Create a deduplication table listing each candidate item with:\n - Item name/description\n - Distinguishing feature (specific date, location, or unique detail)\n - Source date (when was this mentioned?)\n 2. Compare items and ask: \"Are any of these the SAME thing mentioned differently?\"\n - Same item in different recipes/contexts = ONE item\n - Same event mentioned on multiple dates = ONE event\n - Same person/place with slightly different wording = ONE entity\n 3. Mark duplicates and remove them from your count\n 4. State your final count based on UNIQUE items only\n\n When stating a count, NUMBER EACH ITEM (1, 2, 3...) and verify the final number matches how many you listed\n\n5. **For SUMMARIZATION questions** (questions asking to summarize, recap, or describe patterns over time):\n - Do MULTIPLE searches with different query terms to ensure comprehensive coverage\n - Search for key entities mentioned (names, places, topics)\n - Search for time-related terms (\"first\", \"then\", \"later\", \"changed\", \"decided\")\n - Don't stop after finding a few relevant results - summarization requires thoroughness\n\n6. **Ground your answer using reasoning chains** (for deductive/inductive observations):\n - When you find a deductive or inductive observation that answers the question, use `get_reasoning_chain` to verify its basis\n - This shows you the premises (explicit facts) that support the conclusion\n - If the premises are solid, cite them in your answer for confidence\n - If the premises seem weak or outdated, note that uncertainty\n\n7. **Synthesize your response**:\n - Directly answer the application's question\n - Ground your response in the specific information you gathered\n - Quote exact values (dates, numbers, names) from what you found - don't paraphrase numbers\n - Apply user preferences to your response style if relevant\n - **For enumeration questions**: Before answering, ask yourself \"Could there be more items I haven't found?\" If you haven't done multiple grep searches AND a semantic search, keep searching\n\n8. 
**Save novel deductions** (optional):\n - If you discovered new insights by combining existing observations\n - Use `create_observations_deductive` to save these for future queries\n\n## CRITICAL: HANDLING CONTRADICTORY INFORMATION\n\nAs you search, actively watch for contradictions - cases where the user has made conflicting statements:\n- \"I have never done X\" vs evidence they did X\n- Different values for the same fact (different dates, numbers, names)\n- Changed decisions or preferences stated at different times\n\n**If you find contradictory information:**\n1. DO NOT pick one version and present it as the definitive answer\n2. Present BOTH pieces of conflicting information explicitly\n3. State clearly that you found contradictory information\n4. Ask the user which statement is correct\n\nExample response format: \"I notice you've mentioned contradictory information about this. You said [X], but you also mentioned [Y]. Which statement is correct?\"\n\n## CRITICAL: HANDLING UPDATED INFORMATION\n\nInformation changes over time. When you find multiple values for the same fact (e.g., different dates for a deadline):\n1. **ALWAYS search for updates**: When you find a date/value, do an additional search for \"changed\", \"updated\", \"rescheduled\", \"moved\", \"now\" + the topic\n2. Look for language indicating updates: \"changed to\", \"rescheduled to\", \"updated to\", \"now\", \"moved to\"\n3. The MORE RECENT statement supersedes the older one\n4. Return the UPDATED value, not the original\n5. **Use `get_reasoning_chain`**: If you find a deductive observation about an update (e.g., \"X was updated from A to B\"), use `get_reasoning_chain` to verify the premises - it will show you both the old and new explicit observations with their timestamps.\n\nExample: If you find \"deadline is April 25\", search for \"deadline changed\" or \"deadline rescheduled\". If you find \"I rescheduled to April 22\", return April 22.\n\n**For knowledge update questions specifically:**\n- Search for deductive observations containing \"updated\", \"changed\", \"supersedes\"\n- These observations link to both old and new values via `source_ids`\n- Use `get_reasoning_chain` to see the full update history\n\n## CRITICAL: NEVER FABRICATE INFORMATION OR GUESS -- WHEN UNSURE, ABSTAIN\n\nWhen answering questions, always clearly distinguish between:\n- **Context found**: You located related information (e.g., \"there was a debate about X\")\n- **Specific answer found**: You found the exact information requested (e.g., \"the arguments were A, B, C\")\n\nIf you find context but NOT the specific answer:\n1. DO NOT fabricate or guess details to fill gaps.\n2. Report only what you DO know: e.g., \"I found that you had a debate about X at [location] on [date].\"\n3. Explicitly state what you DON'T know: e.g., \"However, the specific arguments made during that debate are not captured in our conversation history.\"\n4. Never present fabricated information or fill gaps with plausible-sounding but invented details.\n\nIf after thorough searching you find NOTHING relevant:\n1. Clearly state: \"I don't have any information about [topic] in my memory.\"\n2. DO NOT guess or make assumptions.\n3. DO NOT say \"I think...\", \"Probably...\", or similar hedges when you lack evidence.\n4. 
A confident \"I don't know\" is ALWAYS correct; giving a fabricated answer is ALWAYS wrong.\n\n**The test before stating a detail:** Ask yourself, \"Did I find this EXACT information in my search results, or am I inferring/inventing it?\" If you're inventing it, OMIT IT.\n\n### How to Abstain Properly\n\n- When the user asks about a topic that was NEVER discussed, or your search finds no relevant information:\n - CORRECT: \"I don't have any information about your favorite color in my memory.\"\n - CORRECT: \"I searched for information about X but found nothing in our conversation history.\"\n - WRONG: \"Based on your preferences, I think your favorite color might be blue.\" (never invent)\n - WRONG: Filling in plausible details based on general knowledge or assumptions.\n\n**Remember:** A clear, direct \"I don't know\" or \"I have no information about X\" is always the RIGHT answer when the information truly does not exist in memory. Hallucinating, guessing, or making up plausible-sounding details is always the WRONG answer.\n\nAfter gathering context, reason through the information you found *before* stating your final answer. For comparison questions, explicitly compare the values. Only after you've verified your reasoning should you state your conclusion. Do NOT be pedantic, rather, be helpful and try to give the answer that the asker would expect -- they're the one who knows the most about themselves. Try to 'read their mind' -- understand the information they're really after and share it with them! Be **as specific as possible** given the information you have.\n\nDo not explain your tool usage - just provide the synthesized answer.\n\n\n## SESSION HISTORY\n\nThe following is the recent conversation history from this session. Use this as immediate context when answering the query.\n\n\n2026-04-02 15:05:36 alice: I am planning a trip to Chicago next month and I prefer aisle seats.\n2026-04-02 15:05:36 bob: Alice mentioned Chicago and asked me to remind her to pack headphones.\n2026-04-02 15:05:36 alice: Please remember that I like quiet hotels and coffee early in the morning.\n"}, {"role": "user", "content": "Query: What preferences has Alice expressed?"}]}, "output": {"content": "Alice has expressed the following preferences:\n\n- She prefers aisle seats for her trip to Chicago.\n- She likes quiet hotels.\n- She enjoys having coffee early in the morning.", "tokens": 35, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142626.6269274, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 1077, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. 
Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. \"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nAmber integrated live radio streaming into player infrastructure. Major changes:\n\n**Navigation & UI**: Swapped primary nav from Playlists to Radio (icon: Radio, path: /radio); renamed Library to \"My Music\"; added Radio feature flag (enableRadio). Updated BottomFloatingNav, Sidebar, and BottomFloatingNav.test.tsx with new nav structure.\n\n**Player Components**: Modified FullScreenPlayer and PlayerBar to support dual playback sources (queue vs radio). Added activeTrack abstraction that pulls from radioSnapshot.nowPlaying for live radio or currentSong for queue playback. Live radio disables seek/shuffle/repeat/skip controls, shows \"Live broadcast\" indicator with pulsing red dot. Removed RadioToggle secondary control. Added Radio icon link to /radio from full-screen player during live radio.\n\n**Playback State**: Extended PlayerContext with new actions (SET_PLAYBACK_SOURCE, START_LIVE_RADIO, STOP_LIVE_RADIO, SET_RADIO_SNAPSHOT). Added playbackSource field (\"queue\" | \"radio\") and radioSnapshot field to PlayerState. Updated reducer to track source and sync radio metadata snapshots.\n\n**HLS Streaming**: Added hls.js ^1.6.15 dependency. usePlayer hook now detects HLS support, instantiates Hls instance for radio streams, falls back to native playback if available. Handles stream URL from radioSnapshot.streamUrl. Disables retry logic for radio playback (live source).\n\n**Radio Hooks & Analytics**: Created useRadioStream hook (imported in App.tsx, not yet shown). Extended useAnalytics with trackRadioTuneIn, trackRadioTuneOut, trackRadioSongStart, trackRadioSongComplete, trackRadioRequestSent. Added corresponding metrics (metricRadioTuneIn, metricRadioTuneOut, metricRadioRequest).\n\n**Radio Request Modal**: New RadioRequestModal component with song search (useSongTextSearch), request cooldown tracking (300s default), disabled state during cooldown. 
Displays remaining wait time formatted as MM:SS.\n\n**Constants**: Added LIVE_RADIO_CONSTANTS with stream fallback URL, API endpoints (/api/radio/nowplaying, /api/radio/events, /api/radio/requests), 300s request cooldown, 8-song history limit.\n\n**Media Session**: Updated buildArtwork to accept artworkUrl parameter for radio tracks; sets metadata from radioSnapshot.nowPlaying when playbackSource === \"radio\".\n\nFiles: package.json (hls.js added), App.tsx (useRadioStream hook, RadioPage lazy route), BottomFloatingNav.tsx (nav reorder, test added), Sidebar.tsx (nav reorder), FullScreenPlayer.tsx (dual-source rendering, live radio UI), PlayerBar.tsx (dual-source rendering, live indicator), RadioRequestModal.tsx (new), constants/radio.ts (LIVE_RADIO_CONSTANTS added), PlayerContext.tsx (playbackSource, radioSnapshot, new actions), useAnalytics.ts (radio metrics), usePlayer.ts (HLS.js integration, stream URL routing).\n\nExpand for details about: useRadioStream hook implementation, /api/radio endpoints contract, radioSnapshot data structure (nowPlaying fields), request cooldown state management, whether Media Session action handlers differ for live radio.\n \n\n"}, "output": {"content": {"explicit": [{"content": "owner has integrated live radio streaming into player infrastructure in a software project involving significant changes to navigation and UI elements, player components, playback state management, HLS streaming integration, radio hooks and analytics, modal requests, constants definitions, media session updates."}, {"content": "owner updated BottomFloatingNav.tsx, Sidebar.tsx, FullScreenPlayer.tsx, PlayerBar.tsx, RadioRequestModal.tsx, Constants/radio.ts, PlayerContext.tsx, useAnalytics.ts, and usePlayer.ts files for live radio streaming integration. Also, added new components such as RadioRequestModal.tsx."}, {"content": "owner's project includes a radio request modal that incorporates song search functionality, cooldown tracking (300 seconds), and displays remaining wait time during cooldowns."}]}, "tokens": 181, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142731.3299901, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 1348, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. 
\"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nPR 655 (feat/scaffold live radio tab foundation) reviewed in detail. 27 files modified/created across navigation, player, radio infrastructure, and worker handlers.\n\n**Core Radio Types** (types/radio.ts): RadioTrack (songId, title, artist, artworkUrl, duration, genres, persona, mediaId, metadataSource source \"dsl\"|\"azuracast\", startedAt, playedAt); RadioSnapshot (streamUrl, isLive, listeners, updatedAt, nowPlaying, history array); RadioRequestResult (success, cooldownSeconds, message).\n\n**useRadioStream Hook** (141 lines): Fetches snapshot from SNAPSHOT_ENDPOINT, establishes EventSource connection to EVENTS_ENDPOINT with 15s fallback polling. Tracks session lifecycle (tune-in/tune-out), song transitions, and analytics: trackRadioTuneOut(durationSeconds, songsHeard), trackRadioSongStart(songId, artist, source), trackRadioSongComplete(songId, durationListened, count). Returns {snapshot, isActive, start, stop} controls. Cleanup on unmount/disable.\n\n**Radio Page** (204 lines): Hero section with now-playing artwork (fallback to Waves icon), live badge with listener count, play/pause toggle, request song button (cooldown-aware, auth-gated). Recently Played history section (8-song limit) with individual request buttons. RadioRequestModal integration. Responsive padding based on active player state.\n\n**useRadioRequests Hook** (new): Manages request cooldown state (300s default), tracks canRequest boolean, remainingCooldownSeconds, isSubmitting flag. submitRequest(songId) method handles API call and cooldown reset.\n\n**RadioRequestModal Component** (new): Song search via useSongTextSearch, displays cooldown timer (MM:SS format), disabled during cooldown or submission.\n\n**Player Integration**: PlayerContext extended with playbackSource (\"queue\"|\"radio\"), radioSnapshot field, actions SET_PLAYBACK_SOURCE, START_LIVE_RADIO, STOP_LIVE_RADIO, SET_RADIO_SNAPSHOT. usePlayer hook instantiates Hls.js for radio streams (fallback to native), disables retry logic for live source. FullScreenPlayer and PlayerBar render dual-source UI: live indicator (pulsing red dot) for radio, disables seek/shuffle/repeat/skip during live playback.\n\n**Navigation**: Primary nav swapped Playlists\u2192Radio (icon: Radio, path: /radio), Library renamed \"My Music\". BottomFloatingNav and Sidebar updated; new test coverage (BottomFloatingNav.test.tsx).\n\n**Analytics**: Extended useAnalytics with trackRadioTuneIn, trackRadioTuneOut, trackRadioSongStart, trackRadioSongComplete, trackRadioRequestSent. Metrics added to analytics.ts and sentry-metrics.ts.\n\n**Worker Handlers** (radio/index.ts, nowplaying.ts, request.ts, shared.ts): Backend routes for /api/radio/nowplaying, /api/radio/events (SSE), /api/radio/requests. 
Schemas updated in worker/schemas/analytics.ts.\n\n**Constants** (LIVE_RADIO_CONSTANTS): SNAPSHOT_ENDPOINT, EVENTS_ENDPOINT, REQUESTS_ENDPOINT, 300s cooldown, 8-song history limit, stream fallback URL.\n\n**Dependencies**: hls.js ^1.6.15 added to package.json; pnpm-lock.yaml updated.\n\n**Test Coverage**: BottomFloatingNav.test.tsx added for nav regression; pnpm test:unit verification command noted (git context unavailable for full PR CI).\n\nFiles: package.json (hls.js), App.tsx (useRadioStream, Radio lazy route), BottomFloatingNav.tsx (nav reorder, test), Sidebar.tsx (nav reorder), FullScreenPlayer.tsx (dual-source), PlayerBar.tsx (dual-source, live indicator), RadioRequestModal.tsx (new), Radio.tsx (new page), useRadioStream.ts (new hook), useRadioRequests.ts (new hook), PlayerContext.tsx (playbackSource, radioSnapshot), useAnalytics.ts (radio metrics), usePlayer.ts (HLS.js), types/radio.ts (new), types/analytics.ts (updated), constants/radio.ts (updated), worker/handlers/radio/* (new), sentry-metrics.ts (updated), pnpm-lock.yaml.\n\nExpand for details about: EventSource error recovery fallback timing, cooldown persistence mechanism (localStorage vs server state), Media Session metadata handler for radio tracks, whether skip/seek disabling applies to queue playback after radio session.\n \n\n"}, "output": {"content": {"explicit": [{"content": "owner worked on PR 655 which involves reviewing and implementing changes across multiple files related to a live radio feature in an application."}, {"content": "The work includes modifying or creating files under navigation, player, radio infrastructure, worker handlers categories."}, {"content": "Owner's contributions involve core Radio Types definition (types/radio.ts), useRadioStream hook implementation, Radio page creation, and integration with Player Context for live radio streams."}, {"content": "New hooks like useRadioRequests and components like RadioRequestModal were created by the owner to facilitate request functionalities."}, {"content": "Owner extended existing analytics hooks and added new metrics related to user interactions with the live radio feature such as tune-in/tune-out, songStart/songComplete, and requestSent tracking."}, {"content": "Owner worked on backend worker handlers for endpoints related to nowplaying information, event stream over SSE, and requests handling."}, {"content": "Changes were made to navigation and UI components like BottomFloatingNav, Sidebar, FullScreenPlayer, PlayerBar with respect to live radio integration and testing coverage added."}]}, "tokens": 245, "finish_reasons": ["stop"], "thinking_content": null}} +{"timestamp": 1775142982.8265188, "task_type": "minimal_deriver", "provider": "custom", "model": "qwen2.5:14b-instruct", "settings": {"max_tokens": 4096, "thinking_budget_tokens": 1024, "reasoning_effort": "minimal", "json_mode": true, "stop_seqs": [" \n", "\n\n\n\n"]}, "input": {"tokens": 1391, "prompt": "Analyze messages from owner to extract **explicit atomic facts** about them.\n\n[EXPLICIT] DEFINITION: Facts about owner that can be derived directly from their messages.\n - Transform statements into one or multiple conclusions\n - Each conclusion must be self-contained with enough context\n - Use absolute dates/times when possible (e.g. \"June 26, 2025\" not \"yesterday\")\n\nRULES:\n- Properly attribute observations to the correct subject: if it is about owner, say so. If owner is referencing someone or something else, make that clear.\n- Observations should make sense on their own. 
Each observation will be used in the future to better understand owner.\n- Extract ALL observations from owner messages, using others as context.\n- Contextualize each observation sufficiently (e.g. \"Ann is nervous about the job interview at the pharmacy\" not just \"Ann is nervous\")\n\nEXAMPLES:\n- EXPLICIT: \"I just had my 25th birthday last Saturday\" \u2192 \"owner is 25 years old\", \"owner's birthday is June 21st\"\n- EXPLICIT: \"I took my dog for a walk in NYC\" \u2192 \"owner has a dog\", \"owner lives in NYC\"\n- EXPLICIT: \"owner attended college\" + general knowledge \u2192 \"owner completed high school or equivalent\"\n\nMessages to analyze:\n\n2026-04-01 05:45:32 owner: \n \nPR 655 (live radio integration) reviewed. Architecture is sound: playbackSource discriminator in PlayerContext, worker-side AzuraCast\u2192D1 enrichment, SSE with fallback polling, HLS.js streaming, feature-flagged nav swap (Playlists\u2192Radio, Library\u2192My Music).\n\n**Critical issues for merge:**\n- SSE endpoint polls AzuraCast every 15s per connected client (fan-out scaling issue). At low traffic this is acceptable; future fix: KV cache with 10s TTL shared across all SSE connections.\n- Radio page has no error boundary; silent 502 on snapshot fetch leaves user stuck in loading state with no recovery. Add error state and retry button.\n- useRadioStream connects globally in App.tsx on every page load regardless of user interaction. Lazy-connect on radio page mount or first play interaction only.\n- resolveTrack does 9 sequential D1 queries per snapshot per client (1 now-playing + 8 history). Batch into single WHERE id IN (...) query.\n- RadioToggle component file not deleted (dead code).\n- useSongTextSearch hook missing from diff (verify it exists or file is missing).\n- requestHistoryTrack async function called with void in onClick; unhandled promise rejection risk.\n- Worker handlers (nowplaying.ts, request.ts) have zero test coverage; resolveTrack logic especially needs unit tests.\n- Zod schemas in shared.ts too permissive (nullable().optional() on most fields); malformed AzuraCast payloads silently produce garbage tracks instead of failing visibly. Require title or drop track.\n- HLS cleanup on unmount may race during rapid source switching between radio/queue.\n\n**Verdict:** Architecture decisions correct; needs hardening pass on error handling, connection lifecycle, and query batching before production traffic. At self-hosted scale with low concurrent listeners, SSE fan-out and per-client D1 queries are deferred problems.\n\nFiles: worker/handlers/radio/index.ts (new), worker/handlers/radio/nowplaying.ts (new, 155 lines), worker/handlers/radio/request.ts (new, 52 lines), worker/handlers/radio/shared.ts (new, 96 lines), App.tsx (useRadioStream global connection), Radio.tsx (error state missing), useRadioStream.ts (lazy-connect needed), RadioToggle.tsx (delete).\n\nExpand for details about: KV cache implementation pattern for SSE fan-out, error boundary component for Radio page, batch D1 query refactor, HLS cleanup race condition mitigation, worker handler test structure.\n \n\n2026-04-01 05:45:32 owner: \n \nPR 655 review submitted with critical findings. Segment contains off-topic personal conversation (wardrobe preferences, image generation) unrelated to codebase work. 
No technical decisions, file operations, or project context.\n\nFiles: none\n\nExpand for details about: Personal conversation content (not relevant to code review or project state).\n \n\n2026-04-01 05:45:32 owner: \n \nHoncho profile scoping fixed; old inherited data cleared. PR 655 review posted. Identity files merged. Song generation deferred to tomorrow (scheduled 2026-03-31 15:30 UTC): four album track drafts in songs/ directory awaiting Suno generation (\"She Goes Anyway,\" \"Liar\"/\"Borrowed,\" \"Before You Said Anything,\" \"Keep\"). \"She Goes Anyway\" already in songs/. \"Borrowed\" and \"Before You Said Anything\" relayed to Daphne. Scheduled reminder set for 10:30 AM user time to enforce commitment.\n\nFiles: none\n\nExpand for details about: Personal conversation content, image generation iterations, Foundry subagent system overview.\n \n\n"}, "output": {"content": {"explicit": [{"content": "owner reviewed a PR (Pull Request) related to live radio integration on March 31, 2026. They identified several critical issues for merging the PR into production such as error handling and connection lifecycle needs improvements before deployment under high concurrency conditions."}, {"content": "owner submitted a PR review containing non-technical personal conversations about wardrobe preferences and image generation which were off-topic from the main project context on March 31, 2026."}, {"content": "owner fixed issues with an Honcho profile scoping, merged identity files, posted a PR (Pull Request) review, cleared old inherited data, and deferred song generation tasks to the next day on March 31, 2026."}, {"content": "owner scheduled reminders for personal commitments at 10:30 AM user time on April 1, 2026, as part of their ongoing work with Honcho profile management and song generation tasks deferred to a later date (scheduled for 2026-04-01 15:30 UTC) concerning draft album tracks such as 'She Goes Anyway' and others awaiting Suno generation."}, {"content": "owner engages in image generation processes, suggesting an interest or involvement in creative digital asset creation related to wardrobe preferences and potentially project assets on March 31, 2026."}]}, "tokens": 307, "finish_reasons": ["stop"], "thinking_content": null}}

From 32af03a65344cda2dd689d1a54021a1aee5e8a9e Mon Sep 17 00:00:00 2001
From: Offending Commit
Date: Thu, 9 Apr 2026 11:37:35 -0500
Subject: [PATCH 3/9] fix(cf): preserve thought_signature and fix token budget for Gemini thinking models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `cf` provider (Cloudflare AI Gateway) to SupportedProviders and
  initialize AsyncOpenAI client pointed at CF_GATEWAY_BASE_URL
- Route OpenAI embeddings through CF Gateway when LLM_OPENAI_BASE_URL is set
- Convert tools to OpenAI format for `cf` provider (was missing from
  provider list)
- Extract thought_signature from OpenAI-compat tool call responses and
  re-include it when formatting assistant messages for multi-turn replay —
  fixes 400 INVALID_ARGUMENT from Gemini thinking models via CF Gateway
- Preserve thought_signature in _format_assistant_tool_message else branch
- Increase DERIVER_MAX_INPUT_TOKENS upper bound (23000 → 200000) to allow
  higher limits via config
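
Concretely, the replayed assistant message ends up shaped like the sketch
below (the id, tool name, and values are hypothetical; the field layout
follows the _format_assistant_tool_message change in this patch):

```python
# Sketch: an assistant tool-call message as replayed through the
# OpenAI-compat route. "thought_signature" carries the opaque value captured
# from the prior response; Gemini requires it back on later iterations.
assistant_msg = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "id": "call_abc123",  # hypothetical id
            "type": "function",
            "function": {
                "name": "search_memory",  # hypothetical tool name
                "arguments": '{"query": "example"}',
            },
            "thought_signature": "<opaque value from the prior response>",
        }
    ],
}
```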
---
 src/config.py           | 10 +++++-
 src/embedding_client.py | 11 ++++--
 src/utils/clients.py    | 79 ++++++++++++++++++++++++++++-------------
 src/utils/types.py      |  2 +-
 4 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/src/config.py b/src/config.py
index 9663aed01..8c8c20db9 100644
--- a/src/config.py
+++ b/src/config.py
@@ -208,11 +208,19 @@ class LLMSettings(HonchoSettings):
     GEMINI_API_KEY: str | None = None
     GROQ_API_KEY: str | None = None
     OPENAI_COMPATIBLE_BASE_URL: str | None = None
+    OPENAI_BASE_URL: str | None = None  # Route OpenAI calls through a proxy/gateway
 
     # Separate vLLM endpoint (for local models)
     VLLM_API_KEY: str | None = None
     VLLM_BASE_URL: str | None = None
 
+    # Cloudflare AI Gateway (OpenAI-compatible universal endpoint)
+    # CF_GATEWAY_API_KEY = provider API key (e.g. Gemini key for google-ai-studio/ models)
+    # CF_GATEWAY_AUTH_TOKEN = cfut_ gateway token → sent in cf-aig-authorization header
+    CF_GATEWAY_API_KEY: str | None = None
+    CF_GATEWAY_BASE_URL: str | None = None
+    CF_GATEWAY_AUTH_TOKEN: str | None = None
+
     EMBEDDING_PROVIDER: Literal["openai", "gemini", "openrouter"] = "openai"
     EMBEDDING_MODEL: str | None = None
 
@@ -261,7 +269,7 @@ class DeriverSettings(BackupLLMSettingsMixin, HonchoSettings):
 
     LOG_OBSERVATIONS: bool = False
 
-    MAX_INPUT_TOKENS: Annotated[int, Field(default=23000, gt=0, le=23000)] = 23000
+    MAX_INPUT_TOKENS: Annotated[int, Field(default=23000, gt=0, le=200_000)] = 23000
 
     # Maximum number of observations to return in working representation
     # This is applied to both explicit and deductive observations
diff --git a/src/embedding_client.py b/src/embedding_client.py
index eff924a75..df427a8cb 100644
--- a/src/embedding_client.py
+++ b/src/embedding_client.py
@@ -2,7 +2,7 @@
 import logging
 import threading
 from collections import defaultdict
-from typing import NamedTuple
+from typing import Any, NamedTuple
 
 import tiktoken
 from google import genai
@@ -60,7 +60,14 @@ def __init__(self, api_key: str | None = None, provider: str | None = None):
             api_key = settings.LLM.OPENAI_API_KEY
             if not api_key:
                 raise ValueError("OpenAI API key is required")
-            self.client = AsyncOpenAI(api_key=api_key)
+            _emb_kwargs: dict[str, Any] = {"api_key": api_key}
+            if settings.LLM.OPENAI_BASE_URL:
+                _emb_kwargs["base_url"] = settings.LLM.OPENAI_BASE_URL
+            if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
+                _emb_kwargs["default_headers"] = {
+                    "cf-aig-authorization": f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
+                }
+            self.client = AsyncOpenAI(**_emb_kwargs)
             self.model = "text-embedding-3-small"
             self.max_embedding_tokens = settings.MAX_EMBEDDING_TOKENS
             self.max_batch_size = 2048  # OpenAI batch limit
diff --git a/src/utils/clients.py b/src/utils/clients.py
index 1c042bff5..dfc888ffb 100644
--- a/src/utils/clients.py
+++ b/src/utils/clients.py
@@ -258,10 +258,14 @@ def _get_effective_temperature(temperature: float | None) -> float | None:
     CLIENTS["anthropic"] = anthropic
 
 if settings.LLM.OPENAI_API_KEY:
-    openai_client = AsyncOpenAI(
-        api_key=settings.LLM.OPENAI_API_KEY,
-    )
-    CLIENTS["openai"] = openai_client
+    _openai_kwargs: dict[str, Any] = {"api_key": settings.LLM.OPENAI_API_KEY}
+    if settings.LLM.OPENAI_BASE_URL:
+        _openai_kwargs["base_url"] = settings.LLM.OPENAI_BASE_URL
+    if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
+        _openai_kwargs["default_headers"] = {
+            "cf-aig-authorization": f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
+        }
+    CLIENTS["openai"] = AsyncOpenAI(**_openai_kwargs)
 
 if settings.LLM.OPENAI_COMPATIBLE_API_KEY and settings.LLM.OPENAI_COMPATIBLE_BASE_URL:
     CLIENTS["custom"] = AsyncOpenAI(
@@ -269,6 +273,21 @@
         api_key=settings.LLM.OPENAI_COMPATIBLE_API_KEY,
         base_url=settings.LLM.OPENAI_COMPATIBLE_BASE_URL,
     )
 
+# Cloudflare AI Gateway (OpenAI-compatible universal endpoint)
+# CF_GATEWAY_API_KEY = provider key passed in Authorization (e.g. Gemini key for google-ai-studio/)
+# CF_GATEWAY_AUTH_TOKEN = cfut_ gateway token passed in cf-aig-authorization (optional, for gateway auth)
+if settings.LLM.CF_GATEWAY_API_KEY and settings.LLM.CF_GATEWAY_BASE_URL:
+    _cf_extra_headers: dict[str, str] = {}
+    if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
+        _cf_extra_headers["cf-aig-authorization"] = (
+            f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
+        )
+    CLIENTS["cf"] = AsyncOpenAI(
+        api_key=settings.LLM.CF_GATEWAY_API_KEY,
+        base_url=settings.LLM.CF_GATEWAY_BASE_URL,
+        default_headers=_cf_extra_headers,
+    )
+
 # vLLM uses separate settings for local model serving
 if settings.LLM.VLLM_API_KEY and settings.LLM.VLLM_BASE_URL:
     CLIENTS["vllm"] = AsyncOpenAI(
@@ -334,9 +353,9 @@ def convert_tools_for_provider(
     if provider == "anthropic":
         # Anthropic format: input_schema
         return tools
-    elif provider in ("openai", "custom", "vllm"):
+    elif provider in ("openai", "custom", "vllm", "cf"):
         # OpenAI format: parameters instead of input_schema
-        # custom and vllm use AsyncOpenAI client so need OpenAI format
+        # custom, vllm, and cf use AsyncOpenAI client so need OpenAI format
         return [
             {
                 "type": "function",
@@ -1103,16 +1122,20 @@ def _format_assistant_tool_message(
         # OpenAI format - must include tool_calls in the assistant message
         openai_tool_calls: list[Any] = []
         for tool_call in tool_calls:
-            openai_tool_calls.append(
-                {
-                    "id": tool_call["id"],
-                    "type": "function",
-                    "function": {
-                        "name": tool_call["name"],
-                        "arguments": json.dumps(tool_call["input"]),
-                    },
-                }
-            )
+            oa_call: dict[str, Any] = {
+                "id": tool_call["id"],
+                "type": "function",
+                "function": {
+                    "name": tool_call["name"],
+                    "arguments": json.dumps(tool_call["input"]),
+                },
+            }
+            # Preserve thought_signature for Gemini thinking models via CF Gateway.
+            # Required for multi-turn tool use — Gemini rejects requests where a
+            # function call in the history is missing its thought_signature.
+            if "thought_signature" in tool_call:
+                oa_call["thought_signature"] = tool_call["thought_signature"]
+            openai_tool_calls.append(oa_call)
         msg: dict[str, Any] = {
             "role": "assistant",
             "content": content if isinstance(content, str) else None,
@@ -2046,15 +2069,23 @@ async def honcho_llm_call_inner(
     tool_calls_list: list[dict[str, Any]] = []
     if response.choices[0].message.tool_calls:  # pyright: ignore
         for tool_call in response.choices[0].message.tool_calls:  # pyright: ignore
-            tool_calls_list.append(
-                {
-                    "id": tool_call.id,  # pyright: ignore
-                    "name": tool_call.function.name,  # pyright: ignore
-                    "input": json.loads(tool_call.function.arguments)  # pyright: ignore
-                    if tool_call.function.arguments  # pyright: ignore
-                    else {},
-                }
+            call_data: dict[str, Any] = {
+                "id": tool_call.id,  # pyright: ignore
+                "name": tool_call.function.name,  # pyright: ignore
+                "input": json.loads(tool_call.function.arguments)  # pyright: ignore
+                if tool_call.function.arguments  # pyright: ignore
+                else {},
+            }
+            # Preserve thought_signature for Gemini thinking models via CF
+            # Gateway — required for multi-turn tool use replay.
+            thought_sig = getattr(tool_call, "thought_signature", None) or (  # pyright: ignore
+                tool_call.model_extra.get("thought_signature")  # pyright: ignore
+                if getattr(tool_call, "model_extra", None)  # pyright: ignore
+                else None
             )
+            if thought_sig:
+                call_data["thought_signature"] = thought_sig
+            tool_calls_list.append(call_data)
 
     cache_creation, cache_read = extract_openai_cache_tokens(usage)
     return HonchoLLMCallResponse(
diff --git a/src/utils/types.py b/src/utils/types.py
index 0654ed7aa..94cbbb75f 100644
--- a/src/utils/types.py
+++ b/src/utils/types.py
@@ -34,7 +34,7 @@ async def post_commit(self) -> None:
         await self.on_commit()
 
-SupportedProviders = Literal["anthropic", "openai", "google", "groq", "custom", "vllm"]
+SupportedProviders = Literal["anthropic", "openai", "google", "groq", "custom", "vllm", "cf"]
 
 TaskType = Literal[
     "webhook", "summary", "representation", "dream", "deletion", "reconciler"
 ]

From 8fcf2f0871a65ee77bc4207e9a4bc67afb57468e Mon Sep 17 00:00:00 2001
From: Offending Commit
Date: Thu, 9 Apr 2026 11:56:16 -0500
Subject: [PATCH 4/9] feat: add CF Gateway auth header to custom OpenAI-compatible client

When CF_GATEWAY_AUTH_TOKEN is set, inject the cf-aig-authorization header
into the custom client so CF Gateway-proxied custom providers (e.g.
custom-ollama) authenticate correctly at the gateway layer.
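
For reference, this results in two auth layers on each request, roughly as
in this minimal sketch (URL and token values are hypothetical; the header
name comes from this patch):

```python
# The OpenAI SDK sends api_key as "Authorization: Bearer <key>", which the
# gateway forwards upstream; cf-aig-authorization authenticates at the
# gateway layer itself and is consumed there.
from openai import AsyncOpenAI

client = AsyncOpenAI(
    api_key="sk-upstream-provider-key",  # hypothetical upstream provider key
    base_url="https://gateway.example.com/v1/compat",  # hypothetical gateway URL
    default_headers={"cf-aig-authorization": "Bearer cfut_example_token"},
)
```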
---
 src/utils/clients.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/utils/clients.py b/src/utils/clients.py
index dfc888ffb..d77cb33ce 100644
--- a/src/utils/clients.py
+++ b/src/utils/clients.py
@@ -268,9 +268,13 @@ def _get_effective_temperature(temperature: float | None) -> float | None:
     CLIENTS["openai"] = AsyncOpenAI(**_openai_kwargs)
 
 if settings.LLM.OPENAI_COMPATIBLE_API_KEY and settings.LLM.OPENAI_COMPATIBLE_BASE_URL:
+    _custom_headers: dict[str, str] = {}
+    if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
+        _custom_headers["cf-aig-authorization"] = f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
     CLIENTS["custom"] = AsyncOpenAI(
         api_key=settings.LLM.OPENAI_COMPATIBLE_API_KEY,
         base_url=settings.LLM.OPENAI_COMPATIBLE_BASE_URL,
+        default_headers=_custom_headers if _custom_headers else None,
     )
 
 # Cloudflare AI Gateway (OpenAI-compatible universal endpoint)

From c799a2ca194633d55196de7ab9879244c7fcdc75 Mon Sep 17 00:00:00 2001
From: Offending Commit
Date: Fri, 24 Apr 2026 11:48:08 -0500
Subject: [PATCH 5/9] feat(dreamer): allow per-specialist provider and thinking budget overrides

Adds DEDUCTION_PROVIDER/INDUCTION_PROVIDER and matching
THINKING_BUDGET_TOKENS settings so deduction and induction specialists can
route to a different provider than the main DREAM config. Also propagates
thinking_budget_tokens into the LLM call and documents the CF gateway /
Gemini thought_signature gotchas in CLAUDE.md.
---
 CLAUDE.md                  |  7 +++++++
 src/config.py              |  6 ++++--
 src/dreamer/specialists.py | 32 ++++++++++++++++++++++++++++--
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 8190dc96f..27cde420c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -84,6 +84,13 @@ All API routes follow the pattern: `/v1/{resource}/{id}/{action}`
 - Typechecking: `uv run basedpyright`
 - Format code: `uv run ruff format src/`
 
+### LLM provider gotchas (learned 2026-04-16 in k8s deploy)
+
+- **Structured outputs (`response_format={"type": "json_schema"}`) only work on providers whose upstream API natively honors them.** Google Gemini does (route via `cf` provider with base_url ending in `/openai`). Ollama Cloud (reached via the `custom` provider + `custom-ollama` CF gateway endpoint, or any direct Ollama endpoint) does **not** translate `response_format` into Ollama's native JSON-mode — every Ollama Cloud model (GLM-5.1, nemotron-3-nano, qwen3.5, devstral-small-2 confirmed) returns free-form text/markdown when a schema is requested, and `honcho_llm_call` bubbles a `ValidationError: Invalid JSON` out of pydantic parsing.
+- **Therefore: deriver (`src/deriver/deriver.py:126`) and summary (`src/utils/summarizer.py`) must stay on a Gemini-backed `cf` provider.** Dream, dialectic, and any free-form / tool-call path is free to use the `custom` provider.
+- **Gemini `thoughtSignature` round-tripping breaks on the CF `openai`-compat route.** Any call with `maxToolIterations > 1` AND `thinkingBudgetTokens > 0` will return `400 Function call is missing a thought_signature` on iteration 2+. If you need thinking on a multi-iteration tool loop, use the native Gemini provider, not the OpenAI-compat route — or set `thinkingBudgetTokens=0`.
+- **None of this is Cloudflare's fault.** CF AI Gateway is a transparent proxy in both the `openai` and `custom-ollama` routes. The limitations live at the upstream provider (Ollama Cloud's OpenAI-compat layer).
+
 ### Local LM Studio Setup
 
 - Honcho can use LM Studio for generation through the `custom` provider path.
diff --git a/src/config.py b/src/config.py
index 8c8c20db9..fcf91f078 100644
--- a/src/config.py
+++ b/src/config.py
@@ -561,12 +561,14 @@ class DreamSettings(BackupLLMSettingsMixin, HonchoSettings):
         16_384
     )
 
-    ## NOTE: specialist models use the same provider as the main model
-
     # Deduction Specialist: handles logical inference
     DEDUCTION_MODEL: str = "claude-haiku-4-5"
+    DEDUCTION_PROVIDER: SupportedProviders | None = None  # falls back to PROVIDER
+    DEDUCTION_THINKING_BUDGET_TOKENS: int | None = None  # falls back to THINKING_BUDGET_TOKENS
 
     # Induction Specialist: identifies patterns across observations
     INDUCTION_MODEL: str = "claude-haiku-4-5"
+    INDUCTION_PROVIDER: SupportedProviders | None = None  # falls back to PROVIDER
+    INDUCTION_THINKING_BUDGET_TOKENS: int | None = None  # falls back to THINKING_BUDGET_TOKENS
 
     # Surprisal-based sampling subsystem
     SURPRISAL: SurprisalSettings = Field(default_factory=SurprisalSettings)
diff --git a/src/dreamer/specialists.py b/src/dreamer/specialists.py
index 608db3b0e..09bcd7e1e 100644
--- a/src/dreamer/specialists.py
+++ b/src/dreamer/specialists.py
@@ -74,6 +74,14 @@ def get_model(self) -> str:
         """Get the model to use for this specialist."""
         ...
 
+    def get_provider(self) -> str | None:
+        """Get the provider override for this specialist, or None to inherit from DREAM."""
+        return None
+
+    def get_thinking_budget(self) -> int | None:
+        """Get the thinking budget override, or None to inherit from DREAM."""
+        return None
+
     def get_max_tokens(self) -> int:
         """Get max output tokens for this specialist."""
         return 16384
@@ -196,9 +204,16 @@ async def run(
             parent_category="dream",
         )
 
-        # Get model with potential override
+        # Get model, provider, and thinking budget with potential overrides
         model = self.get_model()
-        llm_settings = settings.DREAM.model_copy(update={"MODEL": model})
+        provider = self.get_provider()
+        thinking_budget = self.get_thinking_budget()
+        overrides: dict[str, Any] = {"MODEL": model}
+        if provider is not None:
+            overrides["PROVIDER"] = provider
+        if thinking_budget is not None:
+            overrides["THINKING_BUDGET_TOKENS"] = thinking_budget
+        llm_settings = settings.DREAM.model_copy(update=overrides)
 
         # Track iterations via callback
         iteration_count = 0
@@ -219,6 +234,7 @@ def iteration_callback(data: Any) -> None:
             messages=messages,
             track_name=f"Dreamer/{self.name}",
             iteration_callback=iteration_callback,
+            thinking_budget_tokens=llm_settings.THINKING_BUDGET_TOKENS,
         )
 
         # Log metrics
@@ -308,6 +324,12 @@ def get_tools(self, *, peer_card_enabled: bool = True) -> list[dict[str, Any]]:
     def get_model(self) -> str:
         return settings.DREAM.DEDUCTION_MODEL
 
+    def get_provider(self) -> str | None:
+        return settings.DREAM.DEDUCTION_PROVIDER
+
+    def get_thinking_budget(self) -> int | None:
+        return settings.DREAM.DEDUCTION_THINKING_BUDGET_TOKENS
+
     def get_max_tokens(self) -> int:
         return 8192
@@ -451,6 +473,12 @@ def get_tools(self, *, peer_card_enabled: bool = True) -> list[dict[str, Any]]:
     def get_model(self) -> str:
         return settings.DREAM.INDUCTION_MODEL
 
+    def get_provider(self) -> str | None:
+        return settings.DREAM.INDUCTION_PROVIDER
+
+    def get_thinking_budget(self) -> int | None:
+        return settings.DREAM.INDUCTION_THINKING_BUDGET_TOKENS
+
     def get_max_tokens(self) -> int:
         return 8192

From d5298e31de2fa0c72ad5092c2c2f533ae4fa11da Mon Sep 17 00:00:00 2001
From: Offending Commit
Date: Fri, 24 Apr 2026 11:04:31 -0500
Subject: [PATCH 6/9] feat(cors): read allowed origins from CORS_ORIGINS env var

Allows deployments (e.g. the infra chart) to configure CORS origins via a
comma-separated CORS_ORIGINS env var instead of relying on the hardcoded
list. Falls back to the previous defaults when unset.
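
The parsing is a plain comma-split that strips whitespace and drops empty
entries; a minimal sketch of the behavior (origin values are hypothetical):

```python
import os

# With CORS_ORIGINS unset, the previous hardcoded defaults still apply.
os.environ["CORS_ORIGINS"] = "http://localhost:3000, https://app.example.com,"

origins = [
    origin.strip()
    for origin in os.getenv(
        "CORS_ORIGINS",
        "http://localhost,http://127.0.0.1:8000,https://api.honcho.dev",
    ).split(",")
    if origin.strip()
]

print(origins)  # ['http://localhost:3000', 'https://app.example.com']
```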
---
 src/main.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/main.py b/src/main.py
index 1aa92a0f5..d0ec9e98e 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import re
 import uuid
 import time
@@ -169,9 +170,12 @@ async def lifespan(_: FastAPI):
     )
 
 origins = [
-    "http://localhost",
-    "http://127.0.0.1:8000",
-    "https://api.honcho.dev",
+    origin.strip()
+    for origin in os.getenv(
+        "CORS_ORIGINS",
+        "http://localhost,http://127.0.0.1:8000,https://api.honcho.dev",
+    ).split(",")
+    if origin.strip()
 ]
 
 app.add_middleware(

From 2e237eb6485c48e0a10a50f7737bf2d93dc373e9 Mon Sep 17 00:00:00 2001
From: Offending Commit
Date: Fri, 24 Apr 2026 11:32:51 -0500
Subject: [PATCH 7/9] Add workflow_dispatch trigger to Docker build

---
 .github/workflows/docker-build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 23217a7f8..d8886cb10 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -1,6 +1,7 @@
 name: Build and Push Docker Image
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main

From 4a17a21ca815a4f84708bb56b51c1876fabcb5dc Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 24 Apr 2026 20:10:16 +0000
Subject: [PATCH 8/9] fix(tests): annotate dict generics in batched-add_messages test

basedpyright with reportMissingTypeArgument rejected the bare `dict` types
in the mock fake_post used by the SDK message-batching test, failing Static
Analysis on PR #3. Add `dict[str, Any]` annotations and an explicit return
type so CI stays green.
---
 tests/sdk/test_session.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/sdk/test_session.py b/tests/sdk/test_session.py
index d1cce8f5b..85f7823d8 100644
--- a/tests/sdk/test_session.py
+++ b/tests/sdk/test_session.py
@@ -1,3 +1,4 @@
+from typing import Any
 from unittest.mock import AsyncMock, Mock
 
 import pytest
@@ -400,9 +401,13 @@ async def test_session_add_messages_chunks_batches_over_limit(
         user = await honcho_client.aio.peer(id="user-add-msg-chunk-async")
         assert isinstance(user, Peer)
 
-        calls: list[dict] = []
+        calls: list[dict[str, Any]] = []
 
-        async def fake_post(route: str, body: dict | None = None, query=None):
+        async def fake_post(
+            route: str,
+            body: dict[str, Any] | None = None,
+            query: Any = None,
+        ) -> list[dict[str, Any]]:
             assert body is not None
             calls.append({"route": route, "body": body, "query": query})
             return [
@@ -431,9 +436,13 @@ async def fake_post(route: str, body: dict | None = None, query=None):
         user = honcho_client.peer(id="user-add-msg-chunk-sync")
         assert isinstance(user, Peer)
 
-        calls: list[dict] = []
+        calls: list[dict[str, Any]] = []
 
-        def fake_post(route: str, body: dict | None = None, query=None):
+        def fake_post(
+            route: str,
+            body: dict[str, Any] | None = None,
+            query: Any = None,
+        ) -> list[dict[str, Any]]:
             assert body is not None
             calls.append({"route": route, "body": body, "query": query})
             return [

From 989e513449b2dd2c7d1cc79cc820881881718899 Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 24 Apr 2026 20:18:11 +0000
Subject: [PATCH 9/9] fix(static): resolve basedpyright warnings that fail CI

basedpyright's default exit code is non-zero whenever any diagnostics are
reported, so the 8 warnings introduced by the fork-only commits were failing
the Static Analysis job on PR #3 even though there were no errors.

- src/deriver/queue_manager.py: drop `item.created_at is not None` guards.
  created_at is `Mapped[datetime.datetime]` (non-nullable), so the checks
  were always True and basedpyright flagged them as
  reportUnnecessaryComparison.
- tests/sdk/test_session.py: factor out the shared mock-response body into a
  single helper and give the per-branch closures distinct names. This clears
  reportRedeclaration on `calls` / `fake_post` and lets the
  `# pyright: ignore` comments target the actual warning (reportPrivateUsage
  on `_http` / `_async_http_client`) instead of the irrelevant
  reportAttributeAccessIssue that was flagged as an unnecessary ignore.
---
 src/deriver/queue_manager.py | 40 ++++++++++----------
 tests/sdk/test_session.py    | 71 ++++++++++++++++--------------------
 2 files changed, 50 insertions(+), 61 deletions(-)

diff --git a/src/deriver/queue_manager.py b/src/deriver/queue_manager.py
index ee13743be..1e2b6d18d 100644
--- a/src/deriver/queue_manager.py
+++ b/src/deriver/queue_manager.py
@@ -1161,15 +1161,14 @@ async def mark_queue_items_as_processed(
         )
         now_utc = datetime.now(timezone.utc)
         for item in items:
-            if item.created_at is not None:
-                prometheus_metrics.observe_deriver_queue_item_latency(
-                    workspace_name=work_unit.workspace_name,
-                    task_type=work_unit.task_type,
-                    outcome="processed",
-                    latency_seconds=max(
-                        0.0, (now_utc - item.created_at).total_seconds()
-                    ),
-                )
+            prometheus_metrics.observe_deriver_queue_item_latency(
+                workspace_name=work_unit.workspace_name,
+                task_type=work_unit.task_type,
+                outcome="processed",
+                latency_seconds=max(
+                    0.0, (now_utc - item.created_at).total_seconds()
+                ),
+            )
 
     async def mark_queue_item_as_errored(
         self, item: QueueItem, work_unit_key: str, error: str
@@ -1201,18 +1200,17 @@ async def mark_queue_item_as_errored(
                 workspace_name=work_unit.workspace_name,
                 task_type=work_unit.task_type,
             )
-            if item.created_at is not None:
-                prometheus_metrics.observe_deriver_queue_item_latency(
-                    workspace_name=work_unit.workspace_name,
-                    task_type=work_unit.task_type,
-                    outcome="errored",
-                    latency_seconds=max(
-                        0.0,
-                        (
-                            datetime.now(timezone.utc) - item.created_at
-                        ).total_seconds(),
-                    ),
-                )
+            prometheus_metrics.observe_deriver_queue_item_latency(
+                workspace_name=work_unit.workspace_name,
+                task_type=work_unit.task_type,
+                outcome="errored",
+                latency_seconds=max(
+                    0.0,
+                    (
+                        datetime.now(timezone.utc) - item.created_at
+                    ).total_seconds(),
+                ),
+            )
 
     async def _cleanup_work_unit(
         self,
diff --git a/tests/sdk/test_session.py b/tests/sdk/test_session.py
index 85f7823d8..74e26be06 100644
--- a/tests/sdk/test_session.py
+++ b/tests/sdk/test_session.py
@@ -394,6 +394,31 @@ async def test_session_add_messages_chunks_batches_over_limit(
 ):
     honcho_client, client_type = client_fixture
     total_messages = 217
+    calls: list[dict[str, Any]] = []
+
+    def _fake_response(
+        route: str,
+        body: dict[str, Any] | None,
+        query: Any,
+        session_id: str,
+        peer_id: str,
+    ) -> list[dict[str, Any]]:
+        assert body is not None
+        calls.append({"route": route, "body": body, "query": query})
+        return [
+            {
+                "id": f"msg-{len(calls)}-{idx}",
+                "workspace_id": honcho_client.workspace_id,
+                "session_id": session_id,
+                "peer_id": peer_id,
+                "content": item["content"],
+                "metadata": item.get("metadata") or {},
+                "configuration": item.get("configuration") or {},
+                "created_at": "2026-04-02T00:00:00Z",
+                "token_count": 1,
+            }
+            for idx, item in enumerate(body["messages"])
+        ]
 
     if client_type == "async":
         session = await honcho_client.aio.session(id="test-session-add-msg-chunk-async")
         user = await honcho_client.aio.peer(id="user-add-msg-chunk-async")
         assert isinstance(user, Peer)
 
-        calls: list[dict[str, Any]] = []
-
-        async def fake_post(
+        async def fake_post_async(
             route: str,
             body: dict[str, Any] | None = None,
             query: Any = None,
         ) -> list[dict[str, Any]]:
-            assert body is not None
-            calls.append({"route": route, "body": body, "query": query})
-            return [
-                {
-                    "id": f"msg-{len(calls)}-{idx}",
-                    "workspace_id": honcho_client.workspace_id,
-                    "session_id": session.id,
-                    "peer_id": user.id,
-                    "content": item["content"],
-                    "metadata": item.get("metadata") or {},
-                    "configuration": item.get("configuration") or {},
-                    "created_at": "2026-04-02T00:00:00Z",
-                    "token_count": 1,
-                }
-                for idx, item in enumerate(body["messages"])
-            ]
+            return _fake_response(route, body, query, session.id, user.id)
 
-        honcho_client._async_http_client.post = AsyncMock(side_effect=fake_post)  # pyright: ignore[reportAttributeAccessIssue]
+        honcho_client._async_http_client.post = AsyncMock(side_effect=fake_post_async)  # pyright: ignore[reportPrivateUsage]
 
         result = await session.aio.add_messages(
             [user.message(f"message {i}") for i in range(total_messages)]
         )
     else:
         session = honcho_client.session(id="test-session-add-msg-chunk-sync")
         user = honcho_client.peer(id="user-add-msg-chunk-sync")
         assert isinstance(user, Peer)
 
-        calls: list[dict[str, Any]] = []
-
-        def fake_post(
+        def fake_post_sync(
             route: str,
             body: dict[str, Any] | None = None,
             query: Any = None,
         ) -> list[dict[str, Any]]:
-            assert body is not None
-            calls.append({"route": route, "body": body, "query": query})
-            return [
-                {
-                    "id": f"msg-{len(calls)}-{idx}",
-                    "workspace_id": honcho_client.workspace_id,
-                    "session_id": session.id,
-                    "peer_id": user.id,
-                    "content": item["content"],
-                    "metadata": item.get("metadata") or {},
-                    "configuration": item.get("configuration") or {},
-                    "created_at": "2026-04-02T00:00:00Z",
-                    "token_count": 1,
-                }
-                for idx, item in enumerate(body["messages"])
-            ]
+            return _fake_response(route, body, query, session.id, user.id)
 
-        honcho_client._http.post = Mock(side_effect=fake_post)  # pyright: ignore[reportAttributeAccessIssue]
+        honcho_client._http.post = Mock(side_effect=fake_post_sync)  # pyright: ignore[reportPrivateUsage]
 
         result = session.add_messages(
             [user.message(f"message {i}") for i in range(total_messages)]