From 60d2da4201c80f2eb76da8be2502df9d58dbdbcf Mon Sep 17 00:00:00 2001
From: Richard Wooding <richardwooding@Richards-Virtual-Machine.local>
Date: Tue, 28 Apr 2026 13:32:35 +0200
Subject: [PATCH] Add add-sql-dialect agent skill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Repo-local skill that captures the procedure for adding a seventh SQL
dialect to pycel2sql, mirroring the equivalent skill in cel2sql Go and
cel2sql4j Java but adapted to Python idioms (Dialect ABC, single-file
dialect modules, Lark-based converter).

Layout under .claude/skills/add-sql-dialect/:
- SKILL.md (~110 lines) — quick start, picking-the-analogue table,
  critical surface methods, capabilities methods, IndexAdvisor decision,
  doc refresh, verification.
- references/dialect-method-checklist.md — every method on the Dialect
  ABC grouped by category, with one-line "what to emit" guidance per
  method drawn from the six existing implementations.
- references/test-files.md — exhaustive file-by-file checklist (code,
  tests, docs; optional integration + introspect support).
- scripts/scaffold_dialect.py — copies an existing dialect file, renames
  the class, replaces every method body with NotImplementedError, prints
  the next manual steps. Refuses to overwrite. Walks up from __file__ to
  find the repo root, so it works from any cwd inside the repo.

Lints clean against .claude/skills/skill-authoring/scripts/lint_skill.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/skills/add-sql-dialect/SKILL.md       | 114 +++++++++++
 .../references/dialect-method-checklist.md    | 118 +++++++++++
 .../add-sql-dialect/references/test-files.md  |  59 ++++++
 .../scripts/scaffold_dialect.py               | 183 ++++++++++++++++++
 4 files changed, 474 insertions(+)
 create mode 100644 .claude/skills/add-sql-dialect/SKILL.md
 create mode 100644 .claude/skills/add-sql-dialect/references/dialect-method-checklist.md
 create mode 100644 .claude/skills/add-sql-dialect/references/test-files.md
 create mode 100755 .claude/skills/add-sql-dialect/scripts/scaffold_dialect.py
diff --git a/.claude/skills/add-sql-dialect/SKILL.md b/.claude/skills/add-sql-dialect/SKILL.md
new file mode 100644
index 0000000..95e046e
--- /dev/null
+++ b/.claude/skills/add-sql-dialect/SKILL.md
@@ -0,0 +1,114 @@
+---
+name: add-sql-dialect
+description: Adds a new SQL dialect to pycel2sql by creating src/pycel2sql/dialect/<name>.py (subclass of the Dialect ABC), registering in the DialectName enum and the get_dialect() factory, threading new test cases through every parametrized test class, and updating the README badge grid and dialect-comparison tables. Use when porting a new database backend (Trino, Snowflake, ClickHouse, MS SQL, Athena, Oracle) or any new analytics engine.
+---
+
+# Add SQL Dialect
+
+Adding a SQL dialect is the largest contribution shape in this repo (~1500-line PR, ~18 file touches in lockstep). The pattern is well-established — six dialects already follow it (PostgreSQL, MySQL, SQLite, DuckDB, BigQuery, Apache Spark). This skill captures the procedure so the engineer can follow the template instead of reverse-engineering the layout from existing dialects.
+
+## Quick start
+
+```bash
+# 1. Pick the closest analogue (see "Picking the analogue" below).
+# 2. Scaffold by copying — the script stubs every Dialect ABC method.
+python .claude/skills/add-sql-dialect/scripts/scaffold_dialect.py duckdb cockroach Cockroach
+#                                                                  ^^^^^^^ template
+#                                                                          ^^^^^^^^^ folder/identifier
+#                                                                                    ^^^^^^^^^ class prefix
+
+# 3. Then:
+#    a. Fill SQL bodies in src/pycel2sql/dialect/cockroach.py (replace NotImplementedError stubs).
+#    b. Add COCKROACH = "cockroach" to the DialectName enum in dialect/_base.py (e.g. after SPARK = "spark").
+#    c. Register CockroachDialect in dialect/__init__.py (_REGISTRY + __all__).
+#    d. Export from src/pycel2sql/__init__.py.
+#    e. Add cockroach_dialect fixture + CockroachDialect() to ALL_DIALECTS in tests/conftest.py.
+#    f. Add to tests/test_dialect_parametrized.py ALL_DIALECTS list.
+#    g. Create tests/test_cockroach.py mirroring tests/test_duckdb.py shape.
+#    h. Update README badge grid + dialect count + comparison table; bump CLAUDE.md.
+#    i. Run: uv run ruff check src/ tests/ && uv run pytest tests/ --ignore=tests/integration
+```
+
+## Picking the analogue
+
+Decide by which existing dialect's syntax shape your target most resembles:
+
+| Question | Operator-style → use DuckDB | Function-style → use BigQuery |
+|---|---|---|
+| Regex match | `target ~ 'p'` (Postgres, DuckDB) | `REGEXP_CONTAINS(target, 'p')` (BigQuery); `target RLIKE 'p'` (Spark) |
+| JSON access | `b->>'f'` (Postgres, DuckDB, MySQL) | `JSON_VALUE(b, '$.f')` (BigQuery); `get_json_object(b, '$.f')` (Spark); `json_extract(b, '$.f')` (SQLite) |
+| Array literal | `ARRAY[…]` (Postgres); `[…]` (DuckDB, BigQuery) | `array(…)` (Spark) |
+| Array index | 1-indexed (Postgres, DuckDB) | 0-indexed (BigQuery via `OFFSET`, Spark direct) |
+| Param placeholder | `$N` (Postgres, DuckDB) | `?` (MySQL, SQLite, Spark) or `@pN` (BigQuery) |
+| Cast to numeric | `::numeric` postfix (Postgres) | `+ 0` arithmetic coercion (MySQL, SQLite, Spark); `CAST(... AS FLOAT64)` (BigQuery) |
+| Format function | `FORMAT('...', ...)` (Postgres, BigQuery) | `printf('...', ...)` (SQLite, DuckDB); `format_string('...', ...)` (Spark); raises (MySQL) |
+
+For the full Dialect-method-by-method matrix across the existing six dialects, see [references/dialect-method-checklist.md](references/dialect-method-checklist.md). When in doubt, copy DuckDB and patch — its layout is the cleanest.
+
+## Critical surface
+
+These methods on the `Dialect` ABC (`src/pycel2sql/dialect/_base.py`) are where dialects diverge most. Plan how to implement them before writing any code:
+
+- `write_regex_match` — operator vs function call vs `RLIKE`.
+- `write_json_field_access` — operator (`->>`) vs function wrapper (`JSON_VALUE`, `get_json_object`), and whether intermediate and final accesses use different forms (Postgres `->` vs `->>`; Spark uses the same function for both).
+- `write_array_literal_open` / `write_array_literal_close` — `ARRAY[`, `[`, `array(`.
+- `write_list_index` / `write_list_index_const` — 0-indexed vs 1-indexed; bare `[i]` vs `[OFFSET(i)]` vs `+ 1`.
+- `write_param_placeholder` — `$N`, `?`, `@pN`. Positional `?` dialects ignore the index argument.
+- `write_extract` for DOW — Sunday=1 (BigQuery, Spark) vs Sunday=0 (Postgres/DuckDB convention) — adjust by `(dayofweek(t) - 1)` etc.
+- `write_cast_to_numeric` — postfix `::TYPE` vs arithmetic coercion `+ 0` vs `CAST(... AS NUMERIC)`.
+- `write_json_array_elements` — must be a **set-returning expression** (used in `FROM <here> AS iter`); use the engine's `EXPLODE` / `UNNEST` / `json_each` / `from_json` form.
+- `write_json_array_membership` / `write_nested_json_array_membership` — must produce a valid RHS for `lhs = ` (subquery form, like SQLite's `(SELECT value FROM json_each(...))`). If your engine cannot construct a boolean predicate without the candidate element, raise `UnsupportedDialectFeatureError` (mirrors `SparkDialect`).
+- `write_format` — per-dialect format() dispatch added in PR #8. Pick `FORMAT(...)`, `printf(...)`, `format_string(...)`, or raise.
+
+## Capabilities methods are not just informational
+
+The four `supports_*()` methods on `Dialect` drive Converter routing. Set them honestly:
+
+```python
+def supports_native_arrays(self) -> bool: return True
+def supports_jsonb(self) -> bool: return False  # Postgres-style JSONB only
+```
+
+## Optional: IndexAdvisor
+
+Implement the `IndexAdvisor` Protocol (in `dialect/_base.py`) only if the engine has user-controllable indexes (BTREE, GIN, ART, CLUSTERING). Skip for storage-layer-driven engines like Spark (Delta Z-order, Iceberg sort) — `get_index_advisor()` returns `None` for non-`IndexAdvisor` dialects, which gives an empty recommendation list (the right semantic for "no SQL-level recommendations").
+
+## Doc refresh
+
+When the implementation is green, refresh:
+
+- `README.md` — bump dialect count (currently "Six SQL dialects"), add badge after the existing six in the badge grid, add column to the comparison table near the placeholder list, add row to the introspect-supported list (only if you also add an introspect module under `src/pycel2sql/introspect/`).
+- `CLAUDE.md` — bump the dialect count near line 7, add a bullet under "Dialect Differences", append `dialect/<name>.py` to the dialect-files list.
+
+The full file-by-file checklist is in [references/test-files.md](references/test-files.md).
+
+## Verification
+
+```bash
+# Lint
+uv run ruff check src/ tests/
+
+# Type check (lark generic-arg notes are pre-existing — see CLAUDE.md)
+uv run mypy src/pycel2sql/
+
+# Unit tests — must pass for the new dialect plus all six existing ones
+uv run pytest tests/ --ignore=tests/integration -v
+
+# Optional integration (if you add Docker fixtures in tests/integration/conftest.py)
+uv pip install -e ".[integration]"
+uv run pytest tests/integration/ -v -k <dialect>
+
+# Skill lint
+python .claude/skills/skill-authoring/scripts/lint_skill.py .claude/skills/add-sql-dialect/
+```
+
+The Dialect ABC is enforced at instantiation time — calling `<New>Dialect()` with any abstract method missing raises `TypeError: Can't instantiate abstract class`. CI's `tests/conftest.py` instantiates every dialect in `ALL_DIALECTS`, so a missing method is caught immediately.
+
+## Scripts
+
+- **Run** `python .claude/skills/add-sql-dialect/scripts/scaffold_dialect.py <template> <new-name> <NewClassPrefix>` — copies an existing dialect file, renames the class to `<NewClassPrefix>Dialect`, replaces every method body with a `raise NotImplementedError(...)` stub, and prints the list of files created plus the next manual steps. Does not register the dialect anywhere — that's left to the engineer to do consciously.
+
+## References
+
+- [references/dialect-method-checklist.md](references/dialect-method-checklist.md) — every method on the `Dialect` ABC grouped by category, with one-line "what to emit" guidance per method drawn from the six existing implementations.
+- [references/test-files.md](references/test-files.md) — exhaustive file-by-file checklist for a new dialect (code, tests, docs).
diff --git a/.claude/skills/add-sql-dialect/references/dialect-method-checklist.md b/.claude/skills/add-sql-dialect/references/dialect-method-checklist.md
new file mode 100644
index 0000000..a468a34
--- /dev/null
+++ b/.claude/skills/add-sql-dialect/references/dialect-method-checklist.md
@@ -0,0 +1,118 @@
+# Dialect Method Checklist
+
+Every abstract method on the `Dialect` ABC (`src/pycel2sql/dialect/_base.py`), grouped by category, with one-line "what to emit" guidance per method drawn from the six existing implementations.
+
+## Contents
+
+- Literals
+- Operators
+- Type casting
+- Arrays
+- JSON
+- Timestamps
+- String functions
+- Comprehensions
+- Regex
+- Struct
+- Validation
+- Capabilities
+
+## Literals
+
+| Method | What to emit | Examples |
+|---|---|---|
+| `write_string_literal(w, value)` | Single-quoted string with `''` escaping (or `\'` for BigQuery). | Postgres/DuckDB/MySQL: `'foo''bar'`. BigQuery: `'foo\'bar'`. |
+| `write_bytes_literal(w, value)` | Hex-encoded byte literal in the engine's preferred form. | Postgres: `'\\x...'`. SQLite/Spark: `X'...'`. BigQuery: `b"..."` form. |
+| `write_param_placeholder(w, param_index)` | Numbered or positional placeholder. | Postgres/DuckDB: `$N`. BigQuery: `@pN`. MySQL/SQLite/Spark: `?` (index ignored). |
+
+## Operators
+
+| Method | What to emit |
+|---|---|
+| `write_string_concat(w, write_lhs, write_rhs)` | Engine's concat form. Postgres/DuckDB: `lhs \|\| rhs`. MySQL: `CONCAT(lhs, rhs)`. SQLite: `lhs \|\| rhs`. BigQuery: `CONCAT(lhs, rhs)`. Spark: `concat(lhs, rhs)`. |
+| `write_regex_match(w, write_target, pattern, case_insensitive)` | Operator or function call. Postgres: `target ~ 'p'` / `~* 'p'`. DuckDB: `regexp_matches(target, 'p')`. MySQL: `target REGEXP 'p'`. BigQuery: `REGEXP_CONTAINS(target, 'p')`. Spark: `target RLIKE 'p'`. SQLite: raises (no portable regex). |
+| `write_like_escape(w)` | The trailing `ESCAPE` clause for `LIKE`. Postgres/DuckDB: ` ESCAPE '\\'`. SQLite: ` ESCAPE '\\'`. MySQL: ` ESCAPE '\\\\'`. BigQuery: empty (no ESCAPE supported). Spark: ` ESCAPE '\\\\'`. |
+| `write_array_membership(w, write_elem, write_array)` | `elem` membership in array. Postgres: `elem = ANY(array)`. DuckDB: `elem = ANY(array)`. BigQuery: `elem IN UNNEST(array)`. Spark: `array_contains(array, elem)` — note arg-order swap. MySQL/SQLite: emit through JSON-array path (no native arrays). |
+
+## Type casting
+
+| Method | What to emit |
+|---|---|
+| `write_cast_to_numeric(w, write_expr)` | Force string→number coercion. Postgres: `expr::numeric`. DuckDB: `expr::DOUBLE`. BigQuery: `CAST(expr AS FLOAT64)`. MySQL/SQLite/Spark: `expr + 0` (arithmetic coercion). |
+| `write_type_name(w, cel_type_name)` | Engine type name for explicit casts. Postgres: lowercase (`bigint`, `double precision`). MySQL: uppercase (`SIGNED`, `DOUBLE`). BigQuery: `BIGNUMERIC`/`FLOAT64`. Spark: `BIGINT`/`DOUBLE`/`STRING`. |
+| `write_epoch_extract(w, write_expr)` | `int(timestamp)` → epoch seconds. Postgres: `EXTRACT(EPOCH FROM expr)::bigint`. DuckDB: `EXTRACT(EPOCH FROM expr)::BIGINT`. MySQL: `UNIX_TIMESTAMP(expr)`. BigQuery: `UNIX_SECONDS(expr)`. Spark: `UNIX_TIMESTAMP(expr)`. SQLite: `CAST(strftime('%s', expr) AS INTEGER)`. |
+| `write_timestamp_cast(w, write_expr)` | `timestamp(string)`. Postgres/DuckDB: `CAST(expr AS TIMESTAMPTZ)`. MySQL: `CAST(expr AS DATETIME)`. BigQuery: `CAST(expr AS TIMESTAMP)`. Spark: `CAST(expr AS TIMESTAMP)`. SQLite: `datetime(expr)`. |
+
+## Arrays
+
+| Method | What to emit |
+|---|---|
+| `write_array_literal_open(w)` / `write_array_literal_close(w)` | Open/close array literal. Postgres: `ARRAY[` / `]`. DuckDB/BigQuery: `[` / `]`. Spark: `array(` / `)`. MySQL: `JSON_ARRAY(` / `)`. SQLite: `json_array(` / `)`. |
+| `write_array_length(w, dimension, write_expr)` | Length, NULL-safe. Wrap in `COALESCE(..., 0)` — every existing dialect does this. Multi-dim raises `UnsupportedDialectFeatureError` for engines without portable multi-dim length (Spark). |
+| `write_list_index(w, write_array, write_index)` | Dynamic index. 1-indexed engines (Postgres, DuckDB, MySQL, SQLite): emit `arr[idx + 1]`. 0-indexed (BigQuery): `arr[OFFSET(idx)]`. Spark: `arr[idx]` (0-indexed direct). |
+| `write_list_index_const(w, write_array, index)` | Constant-int index — same shapes as `write_list_index` with the integer baked in. |
+| `write_empty_typed_array(w, type_name)` | Empty typed array literal for `split(s, d, 0)` etc. Postgres: `ARRAY[]::<type>[]`. DuckDB: `[]::<type>[]`. BigQuery: `ARRAY<<type>>[]`. Spark: `CAST(array() AS ARRAY<<type>>)`. |
+
+## JSON
+
+| Method | What to emit |
+|---|---|
+| `write_json_field_access(w, write_base, field_name, is_final)` | Access a JSON field. Postgres/DuckDB: `base->'field'` (intermediate) / `base->>'field'` (final). MySQL: `base->>'$.field'` (always text). BigQuery: `JSON_QUERY(base, '$.field')` / `JSON_VALUE(base, '$.field')`. Spark: `get_json_object(base, '$.field')` (single function for both). SQLite: `json_extract(base, '$.field')`. |
+| `write_json_existence(w, is_jsonb, field_name, write_base)` | `has(base.field)`. Postgres JSONB: `base ? 'field'`. Postgres JSON: `base->>'field' IS NOT NULL`. Others: `<extract> IS NOT NULL`. |
+| `write_json_array_elements(w, is_jsonb, as_text, write_expr)` | Set-returning expression for `FROM <here>` in comprehensions. Postgres: `jsonb_array_elements_text(expr)`. DuckDB: `json_each(expr)` style. BigQuery: `UNNEST(JSON_QUERY_ARRAY(expr))`. Spark: `EXPLODE(from_json(expr, 'ARRAY<STRING>'))`. SQLite: `json_each(expr)`. |
+| `write_json_array_length(w, write_expr)` | NULL-safe length of a JSON array column. **Wrap in `COALESCE(..., 0)`** — every dialect does this; the BigQuery wrap was added in PR #8 to match. |
+| `write_json_array_membership(w, json_func, write_expr)` | RHS for `lhs = <subquery>` in comprehensions. SQLite: `(SELECT value FROM json_each(expr))`. Spark: raises (no portable boolean-predicate form available without candidate element). |
+| `write_nested_json_array_membership(w, write_expr)` | Same as above but for nested chains. |
+
+## Timestamps
+
+| Method | What to emit |
+|---|---|
+| `write_duration(w, value, unit)` | Constant duration literal. Postgres/DuckDB: `INTERVAL 'N unit'`. MySQL/SQLite: dialect-specific INTERVAL syntax. Spark: `INTERVAL N unit`. BigQuery: `INTERVAL N unit`. |
+| `write_interval(w, write_value, unit)` | Dynamic-value INTERVAL. Same shapes as above with the value coming from a callback. |
+| `write_extract(w, part, write_expr, write_tz)` | `EXTRACT(part FROM expr)`. **DOW special case**: Sunday=1 (BigQuery, Spark) vs Sunday=0 (Postgres convention). Adjust with `(dayofweek(expr) - 1)` (Spark) or modulo arithmetic (BigQuery). |
+| `write_timestamp_arithmetic(w, op, write_ts, write_dur)` | `timestamp +/- duration`. Postgres/DuckDB: `ts op dur`. BigQuery: `TIMESTAMP_ADD(ts, dur)` / `TIMESTAMP_SUB(...)`. MySQL: `DATE_ADD(...)` / `DATE_SUB(...)`. SQLite: `datetime(ts, '<sign>N unit')`. Spark: `ts op dur`. |
+
+## String functions
+
+| Method | What to emit |
+|---|---|
+| `write_contains(w, write_haystack, write_needle)` | `haystack.contains(needle)` → boolean. Postgres: `POSITION(needle IN haystack) > 0`. DuckDB: `CONTAINS(haystack, needle)`. MySQL: `LOCATE(needle, haystack) > 0`. BigQuery: `STRPOS(haystack, needle) > 0`. Spark: `LOCATE(needle, haystack) > 0`. SQLite: `INSTR(haystack, needle) > 0`. |
+| `write_split(w, write_str, write_delim)` | Split into array. Postgres: `STRING_TO_ARRAY(s, d)`. DuckDB: `STRING_SPLIT(s, d)`. BigQuery: `SPLIT(s, d)`. Spark: `split(s, d)`. MySQL: `JSON_ARRAY(s)` (cannot split into a SQL array; emits a single-element JSON array). SQLite: raises. |
+| `write_split_with_limit(w, write_str, write_delim, limit)` | 3-arg split. Spark/Postgres-style: `split(s, d, limit)` or 2-arg + slice. BigQuery: `SPLIT(...)` with `WHERE OFFSET < limit`. |
+| `write_join(w, write_array, write_delim)` | Array → string. Postgres/DuckDB: `ARRAY_TO_STRING(arr, delim, '')`. BigQuery: `ARRAY_TO_STRING(arr, delim)`. Spark: `array_join(arr, delim)`. MySQL: `JSON_UNQUOTE(arr)` (no-op fallback). SQLite: raises. |
+| `write_format(w, fmt_string, write_args)` | `string.format([args])`. Postgres/BigQuery: `FORMAT('fmt', ...)`. SQLite/DuckDB: `printf('fmt', ...)`. Spark: `format_string('fmt', ...)`. MySQL: raises `UnsupportedDialectFeatureError` (no equivalent). |
+
+## Comprehensions
+
+| Method | What to emit |
+|---|---|
+| `write_unnest(w, write_source)` | Set-returning expression for `FROM <here>`. Postgres/DuckDB/BigQuery: `UNNEST(source)`. Spark: `EXPLODE(source)`. MySQL: `JSON_TABLE(source, '$[*]' COLUMNS(...))`. SQLite: `json_each(source)`. |
+| `write_array_subquery_open(w)` | Opens an `ARRAY(SELECT ...)` wrapper for `map()` / `filter()`. Postgres/DuckDB: `ARRAY(SELECT `. BigQuery: `ARRAY(SELECT `. Spark: `(SELECT collect_list(` (different — `collect_list` aggregator). MySQL/SQLite: subquery scaffolding. |
+| `write_array_subquery_expr_close(w)` | Closes the inner expression before the FROM clause. Postgres/DuckDB: emits nothing (no-op). Spark: `)` (closes `collect_list`). |
+
+## Regex
+
+| Method | What to emit |
+|---|---|
+| `convert_regex(re2_pattern)` | Validate RE2 pattern + convert to engine-native form. Returns `(pattern, case_insensitive)`. Postgres/DuckDB/Spark: passthrough after ReDoS validators. MySQL: convert to MySQL POSIX form. SQLite: not called (regex unsupported). |
+
+## Struct
+
+| Method | What to emit |
+|---|---|
+| `write_struct_open(w)` / `write_struct_close(w)` | Struct/record literal opener and closer. Postgres: `ROW(` / `)`. DuckDB: `{` / `}` (struct literal). BigQuery: `STRUCT(` / `)`. Spark: `struct(` / `)`. MySQL/SQLite: `JSON_OBJECT(` / `)` or similar. |
+
+## Validation
+
+| Method | What to emit |
+|---|---|
+| `max_identifier_length()` | Engine's identifier length limit. Postgres/MySQL: 63/64. BigQuery: 1024. Spark: 128. SQLite: no limit (returns 0). |
+| `validate_field_name(name)` | Raise `InvalidFieldNameError` for invalid names. Should check empty, length, regex, reserved-keyword set. |
+
+## Capabilities
+
+| Method | What to emit |
+|---|---|
+| `supports_native_arrays()` | True for Postgres/DuckDB/BigQuery/Spark; False for MySQL/SQLite (use JSON arrays). |
+| `supports_jsonb()` | True for Postgres only (JSONB-specific behaviour like `?` operator). False everywhere else. |
diff --git a/.claude/skills/add-sql-dialect/references/test-files.md b/.claude/skills/add-sql-dialect/references/test-files.md
new file mode 100644
index 0000000..aefdbc9
--- /dev/null
+++ b/.claude/skills/add-sql-dialect/references/test-files.md
@@ -0,0 +1,59 @@
+# Test Files Touchpoints
+
+Exhaustive file-by-file checklist for landing a new dialect. The dialect is wired correctly only when every file in this list is updated.
+
+## Contents
+
+- Code
+- Tests
+- Docs
+- Optional: integration tests
+- Optional: introspect support
+
+## Code
+
+| File | Change |
+|---|---|
+| `src/pycel2sql/dialect/<name>.py` | New file. The class + module-level helpers (`_<NAME>_RESERVED`, regex validators, type maps). |
+| `src/pycel2sql/dialect/_base.py` | Add `<NAME> = "<name>"` to the `DialectName` enum (alphabetical or trailing — the existing members show both patterns). |
+| `src/pycel2sql/dialect/__init__.py` | (a) Add `from pycel2sql.dialect.<name> import <Name>Dialect`. (b) Add `"<Name>Dialect"` to `__all__`. (c) Add `DialectName.<NAME>: <Name>Dialect` to `_REGISTRY`. |
+| `src/pycel2sql/__init__.py` | (a) Add `from pycel2sql.dialect.<name> import <Name>Dialect`. (b) Add `"<Name>Dialect"` to `__all__`. |
+
+## Tests
+
+| File | Change |
+|---|---|
+| `tests/conftest.py` | Add `@pytest.fixture` named `<name>_dialect` returning `<Name>Dialect()`. Append `<Name>Dialect()` to `ALL_DIALECTS`. |
+| `tests/test_dialect_parametrized.py` | Append `pytest.param(<Name>Dialect(), id="<name>")` to the local `ALL_DIALECTS` list. The parametrized tests cover universal SQL only (null/bool/logic/arithmetic/ternary/comparisons/negation), so no skip marks should be needed. If they are, the dialect's universal-SQL handling has a gap — fix the dialect, not the test. |
+| `tests/test_<name>.py` | New file. Mirror the `tests/test_duckdb.py` structure: `TestXxxLiterals`, `TestXxxParams`, `TestXxxArrays`, `TestXxxStringFunctions`, `TestXxxRegex`, `TestXxxTimestamps`, `TestXxxJSON`, `TestXxxValidation`, `TestXxxTypeCasting`, `TestXxxComprehensions`, `TestXxxStructs`. Aim for 30–50 cases. |
+
+If you add a method to the `Dialect` ABC as part of this dialect, every existing dialect file plus the dialect-method-checklist reference (in the `add-cel-feature` skill) must also be updated.
+
+## Docs
+
+| File | Change |
+|---|---|
+| `README.md` | (a) Bump dialect-count phrasing in the intro (currently "Six SQL dialects"). (b) Add a badge after the existing six in the badge grid. (c) Add an entry in the "Dialects" code example showing `get_dialect("<name>")`. (d) Add a row to the placeholder-style table (`?` / `$N` / `@pN`). (e) If introspect support is added (see below), update the introspect-supported list. |
+| `CLAUDE.md` | (a) Bump the dialect count near line 7 (e.g. "six" → "seven"). (b) Append the new dialect file to the `dialect/{...}.py` list. (c) Add a one-line bullet under "Dialect Differences" describing the dialect's params/arrays/regex/JSON/format conventions. |
+
+## Optional: integration tests
+
+`tests/integration/` runs generated SQL against real databases via testcontainers. The test runner adapts conftest fixtures per dialect.
+
+| File | Change |
+|---|---|
+| `tests/integration/conftest.py` | Add a Docker fixture for the dialect's container (port, image tag, health check). Look at the existing Postgres/MySQL fixtures for the pattern. |
+| `tests/integration/test_*.py` | The shared integration suite parametrizes over connection fixtures. If the dialect's connector library has a different cursor / parameter-style API, you may need a thin adapter. |
+
+Apache Spark integration tests are deliberately deferred (see PR #8) because of the heavy testcontainers dependency. The same applies to new dialects until there is a working containerized target.
+
+## Optional: introspect support
+
+If you want users to be able to auto-discover schemas from a live connection:
+
+| File | Change |
+|---|---|
+| `src/pycel2sql/introspect/<name>.py` | New file implementing `introspect_<name>(connection, ...) → dict[str, Schema]`. Mirror `introspect/postgres.py` for relational engines, `introspect/sqlite.py` for engines that need column-list parsing from a `PRAGMA`-style command. |
+| `src/pycel2sql/introspect/__init__.py` | Re-export the new function. |
+| `tests/test_introspect.py` | Add a unit test mocking the connection and asserting the parsed Schema. |
+| `tests/integration/test_introspect.py` | Add an integration test against the real container — only if the dialect has integration tests already. |
diff --git a/.claude/skills/add-sql-dialect/scripts/scaffold_dialect.py b/.claude/skills/add-sql-dialect/scripts/scaffold_dialect.py
new file mode 100755
index 0000000..8b242d4
--- /dev/null
+++ b/.claude/skills/add-sql-dialect/scripts/scaffold_dialect.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Scaffold a new pycel2sql dialect by copying an existing one and stubbing it.
+
+Usage:
+    python scaffold_dialect.py <template> <new-name> <NewClassPrefix>
+
+Example:
+    python scaffold_dialect.py duckdb cockroach Cockroach
+
+What it does:
+    1. Reads src/pycel2sql/dialect/<template>.py (the analogue dialect).
+    2. Renames the class to <NewClassPrefix>Dialect.
+    3. Replaces every method body with `raise NotImplementedError(...)`.
+    4. Writes to src/pycel2sql/dialect/<new-name>.py (refuses to overwrite).
+    5. Prints the next manual steps (DialectName, _REGISTRY, tests, docs).
+
+What it does NOT do:
+    - Register the dialect anywhere (you do that consciously).
+    - Touch tests/ or README/CLAUDE.md.
+    - Create scripts in scripts/ directories — only the dialect file.
+
+The repo root is located by walking up from this script's own path, so it can be run from any working directory.
+"""
+from __future__ import annotations
+
+import re
+import sys
+from pathlib import Path
+
+REPO_ROOT_HINT = "src/pycel2sql/dialect"
+
+
+def find_repo_root(start: Path) -> Path:
+    """Walk up to find a directory that contains src/pycel2sql/dialect."""
+    cur = start.resolve()
+    while cur != cur.parent:
+        if (cur / REPO_ROOT_HINT).is_dir():
+            return cur
+        cur = cur.parent
+    raise SystemExit(
+        f"could not locate {REPO_ROOT_HINT}/ above {start}; "
+        "run this script from inside the pycel2sql repo"
+    )
+
+
+def stub_method_bodies(src: str, template_class: str, new_class: str) -> str:
+    """Rewrite every method body to `raise NotImplementedError(...)`.
+
+    Module-level helpers (the regex / validation / type-map blocks above the
+    class) are kept verbatim — the engineer will adapt them by hand.
+    """
+    # Split into module-level prefix and the class block.
+    class_re = re.compile(rf"^class {re.escape(template_class)}\(Dialect\):", re.M)
+    m = class_re.search(src)
+    if not m:
+        raise SystemExit(
+            f"template class `{template_class}` not found in source — "
+            "did you pass the right template name?"
+        )
+
+    prefix = src[: m.start()]
+    body = src[m.start() :]
+
+    # Rename the class header.
+    body = body.replace(
+        f"class {template_class}(Dialect):",
+        f"class {new_class}(Dialect):",
+        1,
+    )
+
+    # Walk methods and replace their bodies. A method starts at a line of the
+    # form `    def name(...)` and continues until the next 4-space-indented
+    # `def ` or end-of-file.
+    method_header_re = re.compile(r"^(    def [^(]+\([^)]*\)(?:\s*->\s*[^:]+)?:\s*$)", re.M)
+    headers = list(method_header_re.finditer(body))
+
+    out: list[str] = []
+    last_end = 0
+    for i, hm in enumerate(headers):
+        out.append(body[last_end : hm.end()])
+        # body of this method ends at the next method header or EOF
+        next_start = headers[i + 1].start() if i + 1 < len(headers) else len(body)
+        # Extract the method name for the NotImplementedError message.
+        name_m = re.search(r"def\s+(\w+)\s*\(", hm.group(1))
+        method_name = name_m.group(1) if name_m else "?"
+        # Build the stubbed body. Preserve any leading docstring if it's already present.
+        block = body[hm.end() : next_start]
+        docstring_m = re.match(r'^\s*("""[^"]*?"""|\'\'\'[^\']*?\'\'\')\s*\n', block)
+        stub_indent = "        "
+        new_block_lines: list[str] = []
+        new_block_lines.append("\n")
+        if docstring_m:
+            new_block_lines.append(stub_indent + docstring_m.group(1) + "\n")
+        new_block_lines.append(
+            stub_indent
+            + f"raise NotImplementedError("
+            f'"{new_class}.{method_name}() not implemented yet")\n'
+        )
+        new_block_lines.append("\n")
+        out.append("".join(new_block_lines))
+        last_end = next_start
+
+    if not headers:
+        # No methods found — class body is unusual; bail out so we don't over-write.
+        raise SystemExit(
+            f"no methods found in class `{template_class}`; "
+            "scaffold script is confused — copy by hand instead"
+        )
+
+    return prefix + "".join(out)
+
+
+def main(argv: list[str]) -> int:
+    if len(argv) != 4:
+        print(__doc__)
+        return 2
+
+    template, new_name, new_prefix = argv[1], argv[2], argv[3]
+    if not re.fullmatch(r"[a-z][a-z0-9_]*", new_name):
+        print(f"new-name must be lowercase identifier, got {new_name!r}")
+        return 2
+    if not re.fullmatch(r"[A-Z][A-Za-z0-9]*", new_prefix):
+        print(f"NewClassPrefix must be CamelCase starting with uppercase, got {new_prefix!r}")
+        return 2
+
+    here = Path(__file__).resolve().parent
+    repo_root = find_repo_root(here)
+    dialect_dir = repo_root / REPO_ROOT_HINT
+
+    template_path = dialect_dir / f"{template}.py"
+    new_path = dialect_dir / f"{new_name}.py"
+
+    if not template_path.exists():
+        print(f"template not found: {template_path}")
+        print(f"available: {sorted(p.stem for p in dialect_dir.glob('*.py') if not p.stem.startswith('_'))}")
+        return 1
+    if new_path.exists():
+        print(f"refusing to overwrite existing {new_path}")
+        return 1
+
+    template_class = f"{template[0].upper()}{template[1:]}Dialect"
+    # special-cases that don't follow the naming convention
+    template_class = {
+        "postgres": "PostgresDialect",
+        "duckdb": "DuckDBDialect",
+        "bigquery": "BigQueryDialect",
+        "mysql": "MySQLDialect",
+        "sqlite": "SQLiteDialect",
+        "spark": "SparkDialect",
+    }.get(template, template_class)
+
+    new_class = f"{new_prefix}Dialect"
+
+    src = template_path.read_text(encoding="utf-8")
+    out = stub_method_bodies(src, template_class, new_class)
+
+    new_path.write_text(out, encoding="utf-8")
+
+    print(f"created {new_path.relative_to(repo_root)}")
+    print()
+    print("Next manual steps (in order):")
+    print(f"  1. Fill in SQL bodies in src/pycel2sql/dialect/{new_name}.py")
+    print(f"     (every method currently raises NotImplementedError).")
+    print(f"  2. Add `{new_name.upper()} = \"{new_name}\"` to DialectName in")
+    print(f"     src/pycel2sql/dialect/_base.py.")
+    print(f"  3. Register {new_class} in src/pycel2sql/dialect/__init__.py:")
+    print(f"     - import + add to __all__")
+    print(f"     - add DialectName.{new_name.upper()}: {new_class} to _REGISTRY")
+    print(f"  4. Export {new_class} from src/pycel2sql/__init__.py.")
+    print(f"  5. Update tests/conftest.py:")
+    print(f"     - add {new_name}_dialect fixture")
+    print(f"     - append {new_class}() to ALL_DIALECTS")
+    print(f"  6. Update tests/test_dialect_parametrized.py ALL_DIALECTS.")
+    print(f"  7. Create tests/test_{new_name}.py mirroring tests/test_duckdb.py.")
+    print(f"  8. Update README.md (badge, dialect count, placeholder table) and")
+    print(f"     CLAUDE.md (count, dialect-files list, Dialect Differences bullet).")
+    print(f"  9. Run: uv run ruff check src/ tests/ && \\")
+    print(f"            uv run pytest tests/ --ignore=tests/integration -v")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))