From e69fcdfd7183d27ac7c95a51460702557c20f568 Mon Sep 17 00:00:00 2001 From: Raj Nandan Sharma Date: Thu, 18 Jun 2026 11:39:10 +0530 Subject: [PATCH] fix(database): isolate web and worker connection pools GET / was throwing KnexTimeoutError ("Timeout acquiring a connection") in production. Root cause was the connection pool, not the database: the single process (SvelteKit + cron scheduler + BullMQ workers) shared one pool capped at 10, while one GET / fans out ~6 queries. A couple of concurrent page loads, or a per-minute monitor burst overlapping a load, exceeded 10 and queued acquires blew past the 15s timeout. Postgres itself had 97 free slots the whole time and no leak. Split into two pools so background work can't starve page loads: - web pool (DATABASE_POOL_MAX, default 10) serves HTTP requests - worker pool (DATABASE_WORKER_POOL_MAX, default 5) serves background jobs Routing is by execution context via AsyncLocalStorage: q.createWorker (the single chokepoint all workers/schedulers flow through) runs each processor inside a worker-pool context, and BaseRepository.knex resolves the pool from that context, defaulting to the web pool. This keeps shared controllers correct whether they run in a request or a job. SQLite has no real pool and reuses a single connection, so the split is a no-op there. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- knexfile.ts | 51 ++++++++++++++----- src/lib/server/db/db.ts | 4 +- src/lib/server/db/dbimpl.ts | 23 ++++++++- src/lib/server/db/poolContext.ts | 26 ++++++++++ src/lib/server/db/repositories/base.ts | 18 ++++++- src/lib/server/queues/q.ts | 11 +++- .../docs/content/v4/setup/database-setup.md | 12 +++-- .../content/v4/setup/environment-variables.md | 3 +- 8 files changed, 125 insertions(+), 23 deletions(-) create mode 100644 src/lib/server/db/poolContext.ts diff --git a/knexfile.ts b/knexfile.ts index 15e6b77b..e73b2b12 100644 --- a/knexfile.ts +++ b/knexfile.ts @@ -17,9 +17,24 @@ const intFromEnv = (name: string, fallback: number): number => { // TCP keepalive on pooled connections, on by default. Cloud networks (Railway, // Docker Swarm overlays, k8s) silently drop idle TCP connections; without // keepalive the pool keeps handing out dead sockets after an idle period or a -// database restart. See docs/adr/0003-fail-fast-self-healing-db-pool.md. +// database restart. See docs .../setup/database-setup.md. const keepAliveEnabled = process.env.DATABASE_KEEPALIVE !== "false"; +interface PoolConfig { + min: number; + max: number; + idleTimeoutMillis: number; + createTimeoutMillis: number; +} + +// Two pools share one process (Postgres/MySQL only): the WEB pool serves +// SvelteKit requests; the WORKER pool serves background jobs (BullMQ workers + +// schedulers, routed via src/lib/server/db/poolContext.ts). Isolating them +// stops a burst of background jobs from exhausting the connections that serve +// page loads. Budget across both pools: replicas * (web + worker) must stay +// under the database's max_connections. SQLite has no real pool and reuses a +// single connection, so the split does not apply there. 
+// // Pool defaults deviate from knex's on purpose: // - min 0: knex's min 2 connections are never reaped, so they are exactly the // ones that go stale and wedge the app until a manual restart @@ -27,14 +42,17 @@ const keepAliveEnabled = process.env.DATABASE_KEEPALIVE !== "false"; // knex's default 60s during a database outage // Tarn requires max >= 1 and min <= max; clamp so a bad env value can not // produce a pool that fails every acquire -const poolMax = Math.max(1, intFromEnv("DATABASE_POOL_MAX", 10)); -const poolMin = Math.min(intFromEnv("DATABASE_POOL_MIN", 0), poolMax); -const pool = { - min: poolMin, - max: poolMax, - idleTimeoutMillis: intFromEnv("DATABASE_IDLE_TIMEOUT_MS", 30000), - createTimeoutMillis: intFromEnv("DATABASE_CREATE_TIMEOUT_MS", 15000), -}; +const idleTimeoutMillis = intFromEnv("DATABASE_IDLE_TIMEOUT_MS", 30000); +const createTimeoutMillis = intFromEnv("DATABASE_CREATE_TIMEOUT_MS", 15000); +const poolMin = intFromEnv("DATABASE_POOL_MIN", 0); +const buildPool = (max: number): PoolConfig => ({ + min: Math.min(poolMin, max), + max, + idleTimeoutMillis, + createTimeoutMillis, +}); +const webPool = buildPool(Math.max(1, intFromEnv("DATABASE_POOL_MAX", 10))); +const workerPool = buildPool(Math.max(1, intFromEnv("DATABASE_WORKER_POOL_MAX", 5))); const acquireConnectionTimeout = intFromEnv("DATABASE_ACQUIRE_TIMEOUT_MS", 15000); interface KnexConfig { @@ -44,7 +62,7 @@ interface KnexConfig { client?: string; connection?: string | { filename: string } | Record; useNullAsDefault?: boolean; - pool?: typeof pool; + pool?: PoolConfig; acquireConnectionTimeout?: number; } @@ -57,6 +75,12 @@ const knexOb: KnexConfig = { }, databaseType, }; + +// Worker pool config for Postgres/MySQL — same connection as the web config, +// but with the worker pool. Stays null for SQLite (single shared connection), +// in which case the app reuses the web instance for background work too. 
+let workerKnexOb: KnexConfig | null = null; + console.log(`Configuring database with type ${databaseType}`); if (databaseType === "sqlite") { knexOb.client = "better-sqlite3"; @@ -70,8 +94,9 @@ if (databaseType === "sqlite") { connectionString: databaseURL, keepAlive: keepAliveEnabled, }; - knexOb.pool = pool; + knexOb.pool = webPool; knexOb.acquireConnectionTimeout = acquireConnectionTimeout; + workerKnexOb = { ...knexOb, pool: workerPool }; } else if (databaseType === "mysql") { knexOb.client = "mysql2"; knexOb.connection = { @@ -79,11 +104,13 @@ if (databaseType === "sqlite") { enableKeepAlive: keepAliveEnabled, keepAliveInitialDelay: 10000, }; - knexOb.pool = pool; + knexOb.pool = webPool; knexOb.acquireConnectionTimeout = acquireConnectionTimeout; + workerKnexOb = { ...knexOb, pool: workerPool }; } else { console.error("Invalid database type"); process.exit(1); } +export { workerKnexOb }; export default knexOb; diff --git a/src/lib/server/db/db.ts b/src/lib/server/db/db.ts index 30030355..fa447e1b 100644 --- a/src/lib/server/db/db.ts +++ b/src/lib/server/db/db.ts @@ -1,5 +1,5 @@ import DbImpl from "./dbimpl"; -import knexOb from "../../../../knexfile.js"; +import knexOb, { workerKnexOb } from "../../../../knexfile.js"; -const instance: DbImpl = new DbImpl(knexOb); +const instance: DbImpl = new DbImpl(knexOb, workerKnexOb); export default instance; diff --git a/src/lib/server/db/dbimpl.ts b/src/lib/server/db/dbimpl.ts index 63c6d3d9..a1b7af3f 100644 --- a/src/lib/server/db/dbimpl.ts +++ b/src/lib/server/db/dbimpl.ts @@ -1,5 +1,6 @@ import Knex from "knex"; import type { Knex as KnexType } from "knex"; +import { runWithWorkerKnex } from "./poolContext.js"; // Import all repositories import { MonitoringRepository } from "./repositories/monitoring.js"; @@ -29,6 +30,9 @@ export type * from "../types/db.js"; */ class DbImpl { private knex: KnexType; + // Dedicated pool for background jobs (Postgres/MySQL). 
Equals `knex` when + // there is no separate worker pool (e.g. SQLite). + private workerKnex: KnexType; // Domain repositories private monitoring!: MonitoringRepository; @@ -374,8 +378,11 @@ class DbImpl { deleteEmailTemplate!: EmailTemplateConfigRepository["deleteEmailTemplate"]; upsertEmailTemplate!: EmailTemplateConfigRepository["upsertEmailTemplate"]; - constructor(opts: KnexType.Config) { + constructor(opts: KnexType.Config, workerOpts?: KnexType.Config | null) { this.knex = Knex(opts); + // Separate pool for background jobs when configured (Postgres/MySQL); + // otherwise reuse the web pool (SQLite has a single connection). + this.workerKnex = workerOpts ? Knex(workerOpts) : this.knex; // Initialize repositories this.monitoring = new MonitoringRepository(this.knex); @@ -840,6 +847,15 @@ class DbImpl { async init(): Promise<void> {} + /** + * Runs `fn` with all repository queries routed to the worker connection pool. + * Wrap background work (BullMQ job processors, schedulers) with this so a + * burst of jobs cannot exhaust the web pool that serves page loads. + */ + runInWorkerContext<T>(fn: () => Promise<T>): Promise<T> { + return runWithWorkerKnex(this.workerKnex, fn); + } + /** Probes database connectivity with a trivial query. Never throws. */ async ping(): Promise { try { @@ -851,7 +867,10 @@ class DbImpl { } async close(): Promise<void> { - return await this.knex.destroy(); + await this.knex.destroy(); + if (this.workerKnex !== this.knex) { + await this.workerKnex.destroy(); + } } } diff --git a/src/lib/server/db/poolContext.ts b/src/lib/server/db/poolContext.ts new file mode 100644 index 00000000..128870fa --- /dev/null +++ b/src/lib/server/db/poolContext.ts @@ -0,0 +1,26 @@ +import { AsyncLocalStorage } from "node:async_hooks"; +import type { Knex as KnexType } from "knex"; + +// Per-execution-context selection of the database connection pool. 
+// +// Kener runs SvelteKit requests, the cron scheduler, and the BullMQ workers in +// a single process. When they all shared one Knex instance, a burst of +// background jobs could exhaust the connection pool and time out user-facing +// page loads (KnexTimeoutError on acquire). To prevent that, background work +// runs against a dedicated worker pool: queues/q.ts wraps every job processor +// in runWithWorkerKnex(), and BaseRepository reads getWorkerKnex() so its +// queries route to that pool. Anything outside a job (requests, startup, +// migrations) has no store set and falls back to the web pool. +// +// See knexfile.ts for pool sizing and docs .../setup/database-setup.md. +const workerKnexStorage = new AsyncLocalStorage<KnexType>(); + +/** Runs `fn` with all repository queries routed to the worker pool `knex`. */ +export function runWithWorkerKnex<T>(knex: KnexType, fn: () => Promise<T>): Promise<T> { + return workerKnexStorage.run(knex, fn); +} + +/** The worker pool for the current context, or undefined when not in a job. */ +export function getWorkerKnex(): KnexType | undefined { + return workerKnexStorage.getStore(); +} diff --git a/src/lib/server/db/repositories/base.ts b/src/lib/server/db/repositories/base.ts index eee29f39..525e1191 100644 --- a/src/lib/server/db/repositories/base.ts +++ b/src/lib/server/db/repositories/base.ts @@ -1,4 +1,5 @@ import type { Knex as KnexType } from "knex"; +import { getWorkerKnex } from "../poolContext.js"; // Filter types for queries export interface MonitorFilter { @@ -35,9 +36,22 @@ export interface CountResult { * Base repository class that provides access to the Knex instance */ export abstract class BaseRepository { - protected knex: KnexType; + private readonly fallbackKnex: KnexType; constructor(knex: KnexType) { - this.knex = knex; + this.fallbackKnex = knex; + } + + /** + * The Knex instance for the current execution context. 
+ * + * Background jobs run inside a worker-pool context (set in queues/q.ts), so + * their queries use the dedicated worker connection pool. Everything else — + * SvelteKit requests, startup — falls back to the web pool this repository + * was constructed with. This keeps a burst of background jobs from exhausting + * the connections that serve page loads. See poolContext.ts and knexfile.ts. + */ + protected get knex(): KnexType { + return getWorkerKnex() ?? this.fallbackKnex; } } diff --git a/src/lib/server/queues/q.ts b/src/lib/server/queues/q.ts index dc235f11..5f0176a5 100644 --- a/src/lib/server/queues/q.ts +++ b/src/lib/server/queues/q.ts @@ -1,4 +1,5 @@ import { redisIOConnection } from "../redisConnector.js"; +import db from "../db/db.js"; import { Queue, Worker, @@ -40,7 +41,15 @@ export const createWorker = ( concurrency: 5, ...options, }; - return new Worker(queue.name, processor, opts); + // Route every job's database access to the worker pool. This is the single + // chokepoint all BullMQ workers and schedulers flow through, so wrapping here + // isolates background work from the web request pool (see db/poolContext.ts). + // Sandboxed (string/URL) processors run out-of-process and pass through. + const wrapped: Processor = + typeof processor === "function" + ? (job, token) => db.runInWorkerContext(() => Promise.resolve(processor(job, token))) + : processor; + return new Worker(queue.name, wrapped, opts); }; export default { diff --git a/src/routes/(docs)/docs/content/v4/setup/database-setup.md b/src/routes/(docs)/docs/content/v4/setup/database-setup.md index 5fa26527..525dd4bf 100644 --- a/src/routes/(docs)/docs/content/v4/setup/database-setup.md +++ b/src/routes/(docs)/docs/content/v4/setup/database-setup.md @@ -93,21 +93,27 @@ DATABASE_URL=mysql://kener:password@localhost:3306/kener For PostgreSQL and MySQL, Kener ships fail-fast, self-healing pool defaults: no permanently-idle connections, TCP keepalive on, and 15-second connection timeouts. 
This protects deployments on cloud networks (Railway, Docker Swarm overlays, Kubernetes) that silently drop idle TCP connections, which otherwise causes 500s after idle periods and can require a restart after a database outage. +Kener uses **two separate pools** so background work cannot starve page loads: a **web pool** (`DATABASE_POOL_MAX`) for HTTP requests and a **worker pool** (`DATABASE_WORKER_POOL_MAX`) for background jobs (monitor checks, alerting, scheduled tasks). A burst of background jobs can only exhaust the worker pool, leaving the web pool free to serve requests. + Override only if your setup needs it: | Variable | Description | Default | | ----------------------------- | --------------------------------------------------------------- | ------- | | `DATABASE_POOL_MIN` | Minimum pool connections (0 lets idle connections be reclaimed) | `0` | -| `DATABASE_POOL_MAX` | Maximum pool connections | `10` | +| `DATABASE_POOL_MAX` | Max connections for the **web** (HTTP request) pool | `10` | +| `DATABASE_WORKER_POOL_MAX` | Max connections for the **worker** (background job) pool | `5` | | `DATABASE_ACQUIRE_TIMEOUT_MS` | How long a query waits for a free connection before failing | `15000` | | `DATABASE_CREATE_TIMEOUT_MS` | How long a new connection attempt waits before failing | `15000` | | `DATABASE_IDLE_TIMEOUT_MS` | How long a connection may sit idle before being closed | `30000` | | `DATABASE_KEEPALIVE` | TCP keepalive on connections (`true`/`false`) | `true` | +> [!IMPORTANT] +> Budget your pools against the database's `max_connections`: `replicas × (DATABASE_POOL_MAX + DATABASE_WORKER_POOL_MAX)` must stay below it. On small managed Postgres tiers (often capped near 20–25 connections), keep the defaults or lower them. Each `GET /` fans out several queries, so a web pool that is too small causes `KnexTimeoutError` under concurrent traffic. 
+ > [!TIP] > If your database is slow to accept connections (cold starts, cross-region), raise `DATABASE_ACQUIRE_TIMEOUT_MS` and `DATABASE_CREATE_TIMEOUT_MS` instead of disabling keepalive or raising `DATABASE_POOL_MIN`. -These variables have no effect on SQLite. +These variables have no effect on SQLite, which uses a single shared connection. ## Switching databases {#switching-databases} @@ -123,7 +129,7 @@ These variables have no effect on SQLite. - Connection failed: verify host, port, credentials, firewall. - Migration failed: ensure DB exists and user can `CREATE`/`ALTER`. - SQLite write error: ensure directory exists and is writable. -- `KnexTimeoutError: Timeout acquiring a connection`: the database is unreachable or too slow to accept connections — check database health first, then see [Connection pool tuning](#connection-pool-tuning). +- `KnexTimeoutError: Timeout acquiring a connection`: every pooled connection is busy, or the database is unreachable/too slow to accept new ones. If the database is healthy, the pool is too small for your concurrency — raise `DATABASE_POOL_MAX` (and `DATABASE_WORKER_POOL_MAX`) within your `max_connections` budget. See [Connection pool tuning](#connection-pool-tuning). - `Connection terminated unexpectedly` after idle periods: the network dropped an idle connection; keepalive (on by default) prevents this — verify `DATABASE_KEEPALIVE` is not set to `false`. 
## Environment variables {#environment-variables} diff --git a/src/routes/(docs)/docs/content/v4/setup/environment-variables.md b/src/routes/(docs)/docs/content/v4/setup/environment-variables.md index fb081c49..dcfaeb64 100644 --- a/src/routes/(docs)/docs/content/v4/setup/environment-variables.md +++ b/src/routes/(docs)/docs/content/v4/setup/environment-variables.md @@ -242,7 +242,8 @@ SMTP_SECURE=1 | :---------------------------- | :----------------------------------------------------------- | :------------------------------------ | | `DATABASE_URL` | Full database connection string | `sqlite://./database/kener.sqlite.db` | | `DATABASE_POOL_MIN` | Minimum pool connections (PostgreSQL/MySQL) | `0` | -| `DATABASE_POOL_MAX` | Maximum pool connections (PostgreSQL/MySQL) | `10` | +| `DATABASE_POOL_MAX` | Max web/request pool connections (PostgreSQL/MySQL) | `10` | +| `DATABASE_WORKER_POOL_MAX` | Max background-job pool connections (PostgreSQL/MySQL) | `5` | | `DATABASE_ACQUIRE_TIMEOUT_MS` | Wait for a free connection before failing (PostgreSQL/MySQL) | `15000` | | `DATABASE_CREATE_TIMEOUT_MS` | Wait for a new connection before failing (PostgreSQL/MySQL) | `15000` | | `DATABASE_IDLE_TIMEOUT_MS` | Idle time before a connection is closed (PostgreSQL/MySQL) | `30000` |