From 638393efac4e05cea18639f30a9203479cb637eb Mon Sep 17 00:00:00 2001 From: Raj Nandan Sharma Date: Sat, 6 Jun 2026 21:06:06 +0530 Subject: [PATCH] refactor(database): implement connection pool tuning and health checks for improved reliability --- .../0003-fail-fast-self-healing-db-pool.md | 9 + knexfile.ts | 45 ++++- scripts/main.ts | 165 ++++++++++-------- src/error.html | 73 ++++++++ .../server/controllers/layoutController.ts | 6 +- .../server/controllers/siteDataController.ts | 17 +- src/lib/server/db/dbimpl.ts | 10 ++ .../docs/content/v4/setup/database-setup.md | 24 +++ .../docs/content/v4/setup/deployment.md | 15 +- .../content/v4/setup/environment-variables.md | 14 +- src/routes/+error.svelte | 86 +++++++++ 11 files changed, 376 insertions(+), 88 deletions(-) create mode 100644 docs/adr/0003-fail-fast-self-healing-db-pool.md create mode 100644 src/error.html create mode 100644 src/routes/+error.svelte diff --git a/docs/adr/0003-fail-fast-self-healing-db-pool.md b/docs/adr/0003-fail-fast-self-healing-db-pool.md new file mode 100644 index 00000000..99bdbec8 --- /dev/null +++ b/docs/adr/0003-fail-fast-self-healing-db-pool.md @@ -0,0 +1,9 @@ +# Fail-fast, self-healing database pool defaults + +`knexfile.ts` overrides knex's pool defaults for network databases (Postgres, MySQL): `pool.min` is 0 instead of 2, acquire/create timeouts are 15s instead of 60s/30s, and TCP keepalive is enabled on connections. All knobs are overridable via `DATABASE_*` env vars. + +Two production incidents drove this. On Railway, a Postgres outage caused every request to hang for knex's default 60s `acquireConnectionTimeout` before failing with `KnexTimeoutError`, and after the database recovered the app stayed broken until a manual restart. 
In Docker Swarm (#692), the overlay network's conntrack silently dropped idle TCP connections after ~20 minutes, so the first request after an idle period drew a dead socket from the pool and returned a 500; the reporter worked around it with server-side Postgres `tcp_keepalives_*` settings and asked for an application-level fix. + +Both share one root cause: knex keeps `pool.min` connections forever and never validates them. Those permanently-idle sockets are exactly the ones cloud networks (Railway proxies, Swarm overlays, k8s) silently kill, and after any database blip they wedge the pool with corpses. `min: 0` lets the reaper retire every idle connection (`idleTimeoutMillis` 30s, well under typical conntrack windows), keepalive lets the OS detect silently-dropped sockets, and the 15s timeouts turn a minute-long hang into a fast failure during an outage. + +The trade-off: a quiet instance pays connection setup on the first query after idle (tens of milliseconds), and a database that takes longer than 15s to accept connections will see failures where the old defaults would have waited a minute. Deployments with such databases can raise `DATABASE_ACQUIRE_TIMEOUT_MS` / `DATABASE_CREATE_TIMEOUT_MS` rather than the project reverting to defaults that wedge everyone else. diff --git a/knexfile.ts b/knexfile.ts index c95f42a5..a0df63c5 100644 --- a/knexfile.ts +++ b/knexfile.ts @@ -7,13 +7,41 @@ const databaseURLParts = databaseURL.split("://"); const databaseType = databaseURLParts[0]; const databasePath = databaseURLParts[1]; +const intFromEnv = (name: string, fallback: number): number => { + const raw = process.env[name]; + if (raw === undefined) return fallback; + const parsed = parseInt(raw, 10); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +}; + +// TCP keepalive on pooled connections, on by default. 
Cloud networks (Railway, +// Docker Swarm overlays, k8s) silently drop idle TCP connections; without +// keepalive the pool keeps handing out dead sockets after an idle period or a +// database restart. See docs/adr/0003-fail-fast-self-healing-db-pool.md. +const keepAliveEnabled = process.env.DATABASE_KEEPALIVE !== "false"; + +// Pool defaults deviate from knex's on purpose: +// - min 0: knex's min 2 connections are never reaped, so they are exactly the +// ones that go stale and wedge the app until a manual restart +// - 15s acquire/create timeouts: fail fast instead of hanging requests for +// knex's default 60s during a database outage +const pool = { + min: intFromEnv("DATABASE_POOL_MIN", 0), + max: intFromEnv("DATABASE_POOL_MAX", 10), + idleTimeoutMillis: intFromEnv("DATABASE_IDLE_TIMEOUT_MS", 30000), + createTimeoutMillis: intFromEnv("DATABASE_CREATE_TIMEOUT_MS", 15000), +}; +const acquireConnectionTimeout = intFromEnv("DATABASE_ACQUIRE_TIMEOUT_MS", 15000); + interface KnexConfig { migrations: { directory: string }; seeds: { directory: string }; databaseType: string; client?: string; - connection?: string | { filename: string }; + connection?: string | { filename: string } | Record; useNullAsDefault?: boolean; + pool?: typeof pool; + acquireConnectionTimeout?: number; } const knexOb: KnexConfig = { @@ -33,10 +61,21 @@ if (databaseType === "sqlite") { knexOb.useNullAsDefault = true; } else if (databaseType === "postgresql") { knexOb.client = "pg"; - knexOb.connection = databaseURL; + knexOb.connection = { + connectionString: databaseURL, + keepAlive: keepAliveEnabled, + }; + knexOb.pool = pool; + knexOb.acquireConnectionTimeout = acquireConnectionTimeout; } else if (databaseType === "mysql") { knexOb.client = "mysql2"; - knexOb.connection = databaseURL; + knexOb.connection = { + uri: databaseURL, + enableKeepAlive: keepAliveEnabled, + keepAliveInitialDelay: 10000, + }; + knexOb.pool = pool; + knexOb.acquireConnectionTimeout = acquireConnectionTimeout; } else { 
console.error("Invalid database type"); process.exit(1); diff --git a/scripts/main.ts b/scripts/main.ts index 6b7475d8..ecc0094a 100644 --- a/scripts/main.ts +++ b/scripts/main.ts @@ -6,6 +6,7 @@ import Startup from "../src/lib/server/startup.ts"; import shutdownSchedulers from "../src/lib/server/schedulers/shutdown.ts"; import shutdownQueues from "../src/lib/server/queues/shutdown.ts"; import dbInstance from "../src/lib/server/db/db.ts"; +import { redisConnection } from "../src/lib/server/redisConnector.ts"; import knex from "knex"; import knexOb from "../knexfile.js"; @@ -13,89 +14,113 @@ const PORT = process.env.PORT || 3000; const base = process.env.KENER_BASE_PATH || ""; async function start() { - // Dynamic import so BODY_SIZE_LIMIT from .env is available - // before the handler reads it at module top-level - const { handler } = await import("../build/handler.js"); + // Dynamic import so BODY_SIZE_LIMIT from .env is available + // before the handler reads it at module top-level + const { handler } = await import("../build/handler.js"); - const app: any = express(); - const db = knex(knexOb); + const app: any = express(); + const db = knex(knexOb); - app.get(base + "/healthcheck", (req: any, res: any) => { - res.end("ok"); - }); + // Caps a health probe at 2s so a wedged dependency can not hang the + // endpoint. A probe is healthy unless it throws, times out, or resolves false. + const probe = async (check: () => Promise): Promise => { + try { + const result = await Promise.race([ + check(), + new Promise((_, reject) => setTimeout(() => reject(new Error("health probe timeout")), 2000)), + ]); + return result !== false; + } catch { + return false; + } + }; - app.use(handler); + // Reports component health. Always 200 so healthcheck-driven restarters do + // not bounce the app while a dependency is down (a restart can not fix a + // dead database); pass ?strict=1 to get 503 when any component is down. 
+ app.get(base + "/healthcheck", async (req: any, res: any) => { + const [dbOk, redisOk] = await Promise.all([probe(() => dbInstance.ping()), probe(() => redisConnection().ping())]); + const healthy = dbOk && redisOk; + const strict = req.query.strict === "1"; + res.status(strict && !healthy ? 503 : 200).json({ + status: healthy ? "ok" : "degraded", + db: dbOk, + redis: redisOk, + }); + }); - //migrations - async function runMigrations() { - try { - // Rename old .js migration entries to .ts in the knex_migrations table - // so Knex can find the renamed files on disk - const hasTable = await db.schema.hasTable("knex_migrations"); - if (hasTable) { - const oldJsMigrations = await db("knex_migrations").where("name", "like", "%.js"); - for (const row of oldJsMigrations) { - const newName = row.name.replace(/\.js$/, ".ts"); - await db("knex_migrations").where("id", row.id).update({ name: newName }); - console.log(`Renamed migration record: ${row.name} -> ${newName}`); - } - } + app.use(handler); - console.log("Running migrations..."); - await db.migrate.latest(); // Runs migrations to the latest state - console.log("Migrations completed successfully!"); - } catch (err) { - console.error("Error running migrations:", err); - } - } + //migrations + async function runMigrations() { + try { + // Rename old .js migration entries to .ts in the knex_migrations table + // so Knex can find the renamed files on disk + const hasTable = await db.schema.hasTable("knex_migrations"); + if (hasTable) { + const oldJsMigrations = await db("knex_migrations").where("name", "like", "%.js"); + for (const row of oldJsMigrations) { + const newName = row.name.replace(/\.js$/, ".ts"); + await db("knex_migrations").where("id", row.id).update({ name: newName }); + console.log(`Renamed migration record: ${row.name} -> ${newName}`); + } + } - //seed - async function runSeed() { - try { - console.log("Running seed..."); - await db.seed.run(); // Runs seed to the latest state - console.log("Seed 
completed successfully!"); - } catch (err) { - console.error("Error running seed:", err); - } - } + console.log("Running migrations..."); + await db.migrate.latest(); // Runs migrations to the latest state + console.log("Migrations completed successfully!"); + } catch (err) { + console.error("Error running migrations:", err); + } + } - app.listen(PORT, async () => { - await runMigrations(); - await runSeed(); - await db.destroy(); - Startup(); - console.log("Kener is running on port " + PORT + "!"); - }); + //seed + async function runSeed() { + try { + console.log("Running seed..."); + await db.seed.run(); // Runs seed to the latest state + console.log("Seed completed successfully!"); + } catch (err) { + console.error("Error running seed:", err); + } + } - // Graceful shutdown handler - async function gracefulShutdown(signal: string) { - console.log(`\nReceived ${signal}. Starting graceful shutdown...`); + app.listen(PORT, async () => { + await runMigrations(); + await runSeed(); + await db.destroy(); + Startup(); + console.log("Kener is running on port " + PORT + "!"); + }); - try { - console.log("Shutting down schedulers..."); - await shutdownSchedulers(); - console.log("Schedulers shut down successfully."); + // Graceful shutdown handler + async function gracefulShutdown(signal: string) { + console.log(`\nReceived ${signal}. 
Starting graceful shutdown...`); - console.log("Shutting down queues..."); - await shutdownQueues(); - console.log("Queues shut down successfully."); + try { + console.log("Shutting down schedulers..."); + await shutdownSchedulers(); + console.log("Schedulers shut down successfully."); - console.log("Closing database connection..."); - await dbInstance.close(); - console.log("Database connection closed successfully."); + console.log("Shutting down queues..."); + await shutdownQueues(); + console.log("Queues shut down successfully."); - console.log("Graceful shutdown completed."); - process.exit(0); - } catch (err) { - console.error("Error during graceful shutdown:", err); - process.exit(1); - } - } + console.log("Closing database connection..."); + await dbInstance.close(); + console.log("Database connection closed successfully."); - // Handle termination signals - process.on("SIGTERM", () => gracefulShutdown("SIGTERM")); - process.on("SIGINT", () => gracefulShutdown("SIGINT")); + console.log("Graceful shutdown completed."); + process.exit(0); + } catch (err) { + console.error("Error during graceful shutdown:", err); + process.exit(1); + } + } + + // Handle termination signals + process.on("SIGTERM", () => gracefulShutdown("SIGTERM")); + process.on("SIGINT", () => gracefulShutdown("SIGINT")); } start(); diff --git a/src/error.html b/src/error.html new file mode 100644 index 00000000..c282ff16 --- /dev/null +++ b/src/error.html @@ -0,0 +1,73 @@ + + + + + + + %sveltekit.status% — Status page temporarily unavailable + + + +
+

This status page is temporarily unavailable

+

We are having trouble serving this page right now. It usually resolves on its own.

+

This page will retry automatically in 30 seconds.

+
%sveltekit.status% · %sveltekit.error.message%
+
+ + diff --git a/src/lib/server/controllers/layoutController.ts b/src/lib/server/controllers/layoutController.ts index 35e8273b..88e6b081 100644 --- a/src/lib/server/controllers/layoutController.ts +++ b/src/lib/server/controllers/layoutController.ts @@ -7,8 +7,8 @@ import { GetLoggedInSession, GetLocaleFromCookie, GetUsersCount, + HasRequiredEnv, IsEmailSetup, - IsSetupComplete, } from "./controller.js"; import type { EventDisplaySettings, GlobalPageVisibilitySettings, SiteDateTimeFormat } from "$lib/types/site.js"; @@ -86,7 +86,9 @@ export async function GetLayoutServerData(cookies: Cookies, request: Request): P GetUsersCount(), ]); - const isSetupComplete = await IsSetupComplete(); + // Same check as IsSetupComplete, but reuses the site data fetched above + // instead of querying it a second time on every request + const isSetupComplete = HasRequiredEnv() && Object.keys(siteData).length > 0; const selectedLang = GetLocaleFromCookie(siteData, cookies); const siteStatusColors = siteData.colors; diff --git a/src/lib/server/controllers/siteDataController.ts b/src/lib/server/controllers/siteDataController.ts index ce5e78be..fefd8d42 100644 --- a/src/lib/server/controllers/siteDataController.ts +++ b/src/lib/server/controllers/siteDataController.ts @@ -154,14 +154,17 @@ export const GetSiteDataByKey = async (key: string): Promise => { return data.value; }; +/** Checks the env vars required for setup, without touching the database. 
*/ +export const HasRequiredEnv = (): boolean => { + return ( + process.env.KENER_SECRET_KEY !== undefined && + process.env.ORIGIN !== undefined && + process.env.REDIS_URL !== undefined + ); +}; + export const IsSetupComplete = async (): Promise => { - if (process.env.KENER_SECRET_KEY === undefined) { - return false; - } - if (process.env.ORIGIN === undefined) { - return false; - } - if (process.env.REDIS_URL === undefined) { + if (!HasRequiredEnv()) { return false; } let data = await db.getAllSiteData(); diff --git a/src/lib/server/db/dbimpl.ts b/src/lib/server/db/dbimpl.ts index b7621a19..fac63464 100644 --- a/src/lib/server/db/dbimpl.ts +++ b/src/lib/server/db/dbimpl.ts @@ -834,6 +834,16 @@ class DbImpl { async init(): Promise {} + /** Probes database connectivity with a trivial query. Never throws. */ + async ping(): Promise { + try { + await this.knex.raw("select 1"); + return true; + } catch { + return false; + } + } + async close(): Promise { return await this.knex.destroy(); } diff --git a/src/routes/(docs)/docs/content/v4/setup/database-setup.md b/src/routes/(docs)/docs/content/v4/setup/database-setup.md index 98c65ecb..5fa26527 100644 --- a/src/routes/(docs)/docs/content/v4/setup/database-setup.md +++ b/src/routes/(docs)/docs/content/v4/setup/database-setup.md @@ -89,6 +89,26 @@ Use when MySQL/MariaDB is your standard stack. DATABASE_URL=mysql://kener:password@localhost:3306/kener ``` +## Connection pool tuning {#connection-pool-tuning} + +For PostgreSQL and MySQL, Kener ships fail-fast, self-healing pool defaults: no permanently-idle connections, TCP keepalive on, and 15-second connection timeouts. This protects deployments on cloud networks (Railway, Docker Swarm overlays, Kubernetes) that silently drop idle TCP connections, which otherwise causes 500s after idle periods and can require a restart after a database outage. 
+ +Override only if your setup needs it: + +| Variable | Description | Default | +| ----------------------------- | --------------------------------------------------------------- | ------- | +| `DATABASE_POOL_MIN` | Minimum pool connections (0 lets idle connections be reclaimed) | `0` | +| `DATABASE_POOL_MAX` | Maximum pool connections | `10` | +| `DATABASE_ACQUIRE_TIMEOUT_MS` | How long a query waits for a free connection before failing | `15000` | +| `DATABASE_CREATE_TIMEOUT_MS` | How long a new connection attempt waits before failing | `15000` | +| `DATABASE_IDLE_TIMEOUT_MS` | How long a connection may sit idle before being closed | `30000` | +| `DATABASE_KEEPALIVE` | TCP keepalive on connections (`true`/`false`) | `true` | + +> [!TIP] +> If your database is slow to accept connections (cold starts, cross-region), raise `DATABASE_ACQUIRE_TIMEOUT_MS` and `DATABASE_CREATE_TIMEOUT_MS` instead of disabling keepalive or raising `DATABASE_POOL_MIN`. + +These variables have no effect on SQLite. + ## Switching databases {#switching-databases} 1. Backup/export data. @@ -103,9 +123,13 @@ DATABASE_URL=mysql://kener:password@localhost:3306/kener - Connection failed: verify host, port, credentials, firewall. - Migration failed: ensure DB exists and user can `CREATE`/`ALTER`. - SQLite write error: ensure directory exists and is writable. +- `KnexTimeoutError: Timeout acquiring a connection`: the database is unreachable or too slow to accept connections — check database health first, then see [Connection pool tuning](#connection-pool-tuning). +- `Connection terminated unexpectedly` after idle periods: the network dropped an idle connection; keepalive (on by default) prevents this — verify `DATABASE_KEEPALIVE` is not set to `false`. 
## Environment variables {#environment-variables} | Variable | Description | Default | Required | | -------------- | -------------------------- | ------------------------------------- | -------- | | `DATABASE_URL` | Database connection string | `sqlite://./database/kener.sqlite.db` | No | + +Pool tuning variables are listed in [Connection pool tuning](#connection-pool-tuning). diff --git a/src/routes/(docs)/docs/content/v4/setup/deployment.md b/src/routes/(docs)/docs/content/v4/setup/deployment.md index a93dcc16..8c597ab8 100644 --- a/src/routes/(docs)/docs/content/v4/setup/deployment.md +++ b/src/routes/(docs)/docs/content/v4/setup/deployment.md @@ -285,10 +285,21 @@ curl -fsS https://your-domain/healthcheck Expected response body: -```text -ok +```json +{ "status": "ok", "db": true, "redis": true } ``` +`status` is `degraded` when the database or Redis is unreachable. By default the endpoint returns HTTP 200 even when degraded, so healthcheck-driven restarters do not bounce the app while a dependency is down. + +For orchestrators that should act on dependency health (load balancer readiness, alerting), pass `?strict=1` to get HTTP 503 when any component is down: + +```bash +curl -fsS "https://your-domain/healthcheck?strict=1" +``` + +> [!WARNING] +> Do not point a restart-on-failure healthcheck (Docker `HEALTHCHECK`, Railway) at `?strict=1` — restarting Kener cannot fix a dead database and will loop for the whole outage. + ## Next steps {#next-steps} - For reverse proxy and TLS setup, continue with [Reverse Proxy Setup](/docs/v4/guides/reverse-proxy). 
diff --git a/src/routes/(docs)/docs/content/v4/setup/environment-variables.md b/src/routes/(docs)/docs/content/v4/setup/environment-variables.md index 27a5b9f6..d4b51b1f 100644 --- a/src/routes/(docs)/docs/content/v4/setup/environment-variables.md +++ b/src/routes/(docs)/docs/content/v4/setup/environment-variables.md @@ -238,9 +238,15 @@ SMTP_SECURE=1 ### Database Configuration {#database-configuration} -| Variable | Description | Default | -| :------------- | :------------------------------ | :----------------------------- | -| `DATABASE_URL` | Full database connection string | `sqlite://./database/kener.db` | +| Variable | Description | Default | +| :---------------------------- | :----------------------------------------------------------- | :----------------------------- | +| `DATABASE_URL` | Full database connection string | `sqlite://./database/kener.db` | +| `DATABASE_POOL_MIN` | Minimum pool connections (PostgreSQL/MySQL) | `0` | +| `DATABASE_POOL_MAX` | Maximum pool connections (PostgreSQL/MySQL) | `10` | +| `DATABASE_ACQUIRE_TIMEOUT_MS` | Wait for a free connection before failing (PostgreSQL/MySQL) | `15000` | +| `DATABASE_CREATE_TIMEOUT_MS` | Wait for a new connection before failing (PostgreSQL/MySQL) | `15000` | +| `DATABASE_IDLE_TIMEOUT_MS` | Idle time before a connection is closed (PostgreSQL/MySQL) | `30000` | +| `DATABASE_KEEPALIVE` | TCP keepalive on connections (PostgreSQL/MySQL) | `true` | **Supported Databases**: @@ -261,7 +267,7 @@ DATABASE_URL=postgresql://user:password@localhost:5432/kener DATABASE_URL=mysql://user:password@localhost:3306/kener ``` -📖 **See**: [Database Setup Guide](/docs/v4/setup/database-setup) for migration guides and best practices. +📖 **See**: [Database Setup Guide](/docs/v4/setup/database-setup) for migration guides and [connection pool tuning](/docs/v4/setup/database-setup#connection-pool-tuning) for when to change the pool variables. 
### Redis Configuration {#redis-configuration} diff --git a/src/routes/+error.svelte b/src/routes/+error.svelte new file mode 100644 index 00000000..628c05e7 --- /dev/null +++ b/src/routes/+error.svelte @@ -0,0 +1,86 @@ + + + + {page.status} — {isServerFailure ? "Status page temporarily unavailable" : "Something went wrong"} + {#if isServerFailure} + + {/if} + + +
+
+ {#if isServerFailure} +

This status page is temporarily unavailable

+

We are having trouble serving this page right now. It usually resolves on its own.

+

This page will retry automatically in 30 seconds.

+ {:else} +

Something went wrong

+

{page.error?.message || "The page you requested could not be loaded."}

+ {/if} +
{page.status}
+
+
+ +