Fixes the root cause that was silently dropping Stalwart's cert/setting writes, completes the public HTTPS endpoints, and captures the debugging knowledge.

- docker-compose.yml: gate the ts-stalwart healthcheck on Postgres reachability (nc -z the-record-prod:5432) in addition to tailscaled health. Stalwart's depends_on: service_healthy can no longer release it into the window where the tailnet route to Postgres isn't up yet — which was failing table init and losing in-flight cert writes (-> rcgen).
- caddy/caddy.json + README: add the :443 SNI fan-out. mta-sts / autoconfig / autodiscover pass through to stalwart:443 (Stalwart terminates TLS with its wildcard cert; no proxy_protocol on :443). All other SNIs go to the box's web Caddy on :8443 (https_port 8443). L7 reverse_proxy is impossible here: CAA pins issuance to Stalwart's ACME account, so Caddy can't obtain its own cert for these names.
- acl-snippet.hujson: grant tcp:443 on reverse-proxy -> stalwart for the SNI pass-through.
- config/config.json: track the v0.16 bootstrap (commit-safe; the DB secret is an EnvironmentVariable reference, not inline).
- LESSONS.md: symptom -> cause -> fix notes (PG race, DNS-01/Spaceship dead key, auto-ban vs PROXY protocol, wildcard-requires-DNS-01, SNI pass-through, ephemeral sidecar IP, LE rate-limit checks).
- .gitignore: exclude _backup/ and _validate/ (DB dumps + an inline-secret config) and editor swap files. NEVER commit those.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
83 lines
3.4 KiB
YAML
83 lines
3.4 KiB
YAML
# tailwart — Stalwart mailbox as a Tailscale sidecar (NO WAN presence).
#
# The container shares ts-stalwart's network namespace, so its only interfaces
# are lo and tailscale0. All mail ports listen on the tailnet only; the public
# edge is the separate caddy/ layer-4 proxy, which can run on another host.
#
# Prereq: the shared tailnet infra (Postgres the-record-prod, Redis
# slo-time-prod, Garage) must be up, and the stalwart role/db/bucket created
# (see README). Bring up: docker compose up -d

name: tailwart

services:
  # Tailscale sidecar: owns the network namespace that the stalwart service
  # joins via network_mode, and gates stalwart's startup via its healthcheck.
  ts-stalwart:
    # NOTE(review): floating tag, unlike the pinned stalwart image below —
    # consider pinning to a known-good release.
    image: tailscale/tailscale:latest
    hostname: ${STALWART_MAGIC_NAME}
    environment:
      TS_AUTHKEY: ${TS_OAUTH_CLIENT_SECRET}?ephemeral=true
      TS_EXTRA_ARGS: --advertise-tags=tag:stalwart
      TS_HOSTNAME: ${STALWART_MAGIC_NAME}
      TS_ACCEPT_DNS: "true"
      TS_AUTH_ONCE: "true"
      TS_USERSPACE: "false"
      # Local health endpoint; the healthcheck below probes /healthz on it.
      TS_ENABLE_HEALTH_CHECK: "true"
      TS_LOCAL_ADDR_PORT: "127.0.0.1:9002"
    dns: [1.1.1.1, 1.0.0.1]
    # TUN device + capabilities for the tailscale0 interface (TS_USERSPACE is
    # "false", so tailscaled is not running in userspace-networking mode).
    devices:
      - /dev/net/tun:/dev/net/tun
    cap_add:
      - NET_ADMIN
      - NET_RAW
    healthcheck:
      # Healthy only when BOTH the tailnet link is up AND Postgres is reachable
      # over it. The stalwart service gates on this (depends_on: service_healthy),
      # so it can no longer start into the race where it tries the DB before the
      # tailnet route exists — which logged "Failed to create tables" and dropped
      # in-flight cert/setting writes (e.g. lost the ACME cert on 2026-06-10).
      test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:9002/healthz && nc -z -w3 ${DB_MAGIC_NAME}.${TS_TAILNET} 5432"]
      interval: 10s
      timeout: 8s
      retries: 6
      start_period: 30s
    restart: unless-stopped

  # The mail server itself. It has no network of its own: it runs inside
  # ts-stalwart's namespace and starts only once that sidecar reports healthy.
  stalwart:
    image: stalwartlabs/stalwart:v0.16.7
    network_mode: "service:ts-stalwart"
    environment:
      # Secrets and endpoints are passed via env rather than written into the
      # mounted bootstrap file (config/config.json, see volumes below), which
      # keeps that file commit-safe.
      # NOTE(review): this comment previously described config/config.toml and
      # its %{env:NAME}% macros. With the v0.16 JSON bootstrap, only
      # STALWART_DB_PASSWORD is documented here as being referenced from the
      # file — confirm which of the remaining variables are still read.
      STALWART_DB_NAME: ${STALWART_DB_NAME}
      STALWART_DB_USER: ${STALWART_DB_USER}
      STALWART_DB_PASSWORD: ${STALWART_DB_PASSWORD}
      DB_HOST: ${DB_MAGIC_NAME}.${TS_TAILNET}
      REDIS_URL: redis://${REDIS_MAGIC_NAME}.${TS_TAILNET}:6379/${STALWART_REDIS_DB}
      S3_ENDPOINT: http://${GARAGE_MAGIC_NAME}.${TS_TAILNET}:3900
      S3_REGION: ${GARAGE_REGION}
      S3_BUCKET: ${STALWART_S3_BUCKET}
      S3_ACCESS_KEY: ${GARAGE_ACCESS_KEY_ID}
      S3_SECRET_KEY: ${GARAGE_SECRET_ACCESS_KEY}
      STALWART_HOSTNAME: ${STALWART_HOSTNAME}
      STALWART_DOMAIN: ${STALWART_DOMAIN}
      STALWART_SMARTHOST: ${STALWART_SMARTHOST}
      STALWART_FALLBACK_ADMIN_SECRET: ${STALWART_FALLBACK_ADMIN_SECRET}
    volumes:
      # Bootstrap config (v0.16 JSON): tells Stalwart only where Postgres lives;
      # all other settings live in the DB. Mounted at the image's default
      # --config path (/etc/stalwart/config.json). Secret comes from the
      # STALWART_DB_PASSWORD env above, referenced via the EnvironmentVariable
      # secret type inside the file — so this stays commit-safe.
      - ./config/config.json:/etc/stalwart/config.json:ro
      # Working dir: ACME cert cache + outbound queue spool. Named volume (not
      # anonymous) so a recreate doesn't orphan it and drop queued mail/certs.
      - stalwart-data:/var/lib/stalwart
    depends_on:
      ts-stalwart:
        condition: service_healthy
    restart: unless-stopped

volumes:
  # Named volume backing /var/lib/stalwart; no driver options are set.
  stalwart-data: {}