From 05af154b589feb8c818597a0599ffb9a7083966d Mon Sep 17 00:00:00 2001 From: Josh Harris Date: Sat, 21 Feb 2026 22:52:47 +0000 Subject: [PATCH] feat(server): add health api endpoint for increased observability (#3264) ## Description: Adds an additional API endpoint to the server for health, using the master lobby service as the health metric. The master lobby service is considered healthy if the lobby service has started (i.e. it had enough ready workers to start), and the current number of ready workers is at least half of the desired number (and never fewer than one). This means that we won't show as healthy until all the workers start, and then we will continue to show as healthy even if a few workers crash, as long as at least half are still running. Any fewer than that, and the service becomes unhealthy. The endpoint is also set to "no cache" in the nginx config. This is to ensure that any checks of the server health show the true value, and cannot show false/stale data served by nginx, Cloudflare, or anything else.
## Please complete the following: - [x] I have added screenshots for all UI updates - [x] I process any text displayed to the user through translateText() and I've added it to the en.json file - [x] I have added relevant tests to the test directory - [x] I confirm I have thoroughly tested these changes and take full responsibility for any bugs introduced ## Please put your Discord username so you can be contacted if a bug or regression is found: jish --- nginx.conf | 21 +++- src/server/Master.ts | 9 ++ src/server/MasterLobbyService.ts | 7 ++ tests/server/MasterLobbyServiceHealth.test.ts | 119 ++++++++++++++++++ 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 tests/server/MasterLobbyServiceHealth.test.ts diff --git a/nginx.conf b/nginx.conf index 2da48c6af..8466fd86c 100644 --- a/nginx.conf +++ b/nginx.conf @@ -120,6 +120,25 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; } + + # /api/health endpoint - No caching, always hit the backend + location = /api/health { + proxy_pass http://127.0.0.1:3000; + proxy_http_version 1.1; + + # Cache configuration - No caching for health checks; "always" keeps these headers on the 503 unhealthy response + proxy_cache off; + add_header X-Cache-Status "BYPASS" always; + add_header Cache-Control "no-store, no-cache, must-revalidate, proxy-revalidate" always; + add_header Pragma "no-cache" always; + add_header Expires "0" always; + + # Standard proxy headers + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } # /commit.txt endpoint - Cache for 5 seconds location = /commit.txt { @@ -250,4 +269,4 @@ server { proxy_set_header X-Forwarded-Proto $scheme; } -} \ No newline at end of file +} diff --git a/src/server/Master.ts b/src/server/Master.ts index a9322d8ff..90b9cf48f 100644 --- a/src/server/Master.ts +++ b/src/server/Master.ts @@ -145,6 +145,15 @@ app.get("/api/env", async (req, res) => { 
res.json(envConfig); }); +app.get("/api/health", (_req, res) => { + const ready = lobbyService?.isHealthy() ?? false; + if (ready) { + res.json({ status: "ok" }); + } else { + res.status(503).json({ status: "unavailable" }); + } +}); + // SPA fallback route app.get("*", async function (_req, res) { try { diff --git a/src/server/MasterLobbyService.ts b/src/server/MasterLobbyService.ts index 0dfe89285..31e58b55d 100644 --- a/src/server/MasterLobbyService.ts +++ b/src/server/MasterLobbyService.ts @@ -59,6 +59,13 @@ export class MasterLobbyService { this.readyWorkers.delete(workerId); } + isHealthy(): boolean { + // Healthy only once the service has started and at least half of the configured workers (never fewer than one) are ready. + // The half threshold allows for some leeway if a worker crashes. + const minWorkers = Math.max(this.config.numWorkers() / 2, 1); + return this.started && this.readyWorkers.size >= minWorkers; + } + private handleWorkerReady(workerId: number) { this.readyWorkers.add(workerId); this.log.info( diff --git a/tests/server/MasterLobbyServiceHealth.test.ts b/tests/server/MasterLobbyServiceHealth.test.ts new file mode 100644 index 000000000..fe5800eac --- /dev/null +++ b/tests/server/MasterLobbyServiceHealth.test.ts @@ -0,0 +1,119 @@ +import EventEmitter from "events"; +import { describe, expect, it, vi } from "vitest"; +import { MasterLobbyService } from "../../src/server/MasterLobbyService"; +import { TestServerConfig } from "../util/TestServerConfig"; + +vi.mock("../../src/server/Logger", () => ({ + logger: { + child: () => ({ + error: vi.fn(), + info: vi.fn(), + }), + }, +})); + +vi.mock("../../src/server/PollingLoop", () => ({ + startPolling: vi.fn(), +})); + +function createMockWorker(): EventEmitter { + const emitter = new EventEmitter(); + (emitter as any).send = vi.fn(); + return emitter; +} + +function sendWorkerReady(worker: EventEmitter, workerId: number) { + worker.emit("message", { type: "workerReady", workerId }); +} + +function createService(numWorkers: number): MasterLobbyService 
{ + const config = new TestServerConfig(); + vi.spyOn(config, "numWorkers").mockReturnValue(numWorkers); + const log = { info: vi.fn(), error: vi.fn() } as any; + return new MasterLobbyService(config, {} as any, log); +} + +function startAllWorkers( + service: MasterLobbyService, + count: number, +): { id: number; w: EventEmitter }[] { + const workers = Array.from({ length: count }, (_, i) => { + const id = i + 1; + const w = createMockWorker(); + service.registerWorker(id, w as any); + return { id, w }; + }); + for (const { w, id } of workers) { + sendWorkerReady(w, id); + } + return workers; +} + +describe("MasterLobbyService.isHealthy", () => { + it("unhealthy before any workers register", () => { + const service = createService(4); + expect(service.isHealthy()).toBe(false); + }); + + it("unhealthy when workers registered but not ready", () => { + const service = createService(2); + service.registerWorker(1, createMockWorker() as any); + expect(service.isHealthy()).toBe(false); + }); + + it("unhealthy when only some workers are ready (server not started)", () => { + const service = createService(4); + + // 1 of 4 ready -- not enough to flip `started` + const w1 = createMockWorker(); + service.registerWorker(1, w1 as any); + sendWorkerReady(w1, 1); + + expect(service.isHealthy()).toBe(false); + }); + + it("healthy once all workers are ready", () => { + const service = createService(2); + startAllWorkers(service, 2); + expect(service.isHealthy()).toBe(true); + }); + + it("stays healthy after a single worker crash", () => { + const service = createService(4); + startAllWorkers(service, 4); + + service.removeWorker(4); // 3 of 4 left, threshold is max(4 / 2, 1) = 2 + expect(service.isHealthy()).toBe(true); + }); + + it("goes unhealthy when too many workers crash", () => { + const service = createService(4); + startAllWorkers(service, 4); + + service.removeWorker(2); + service.removeWorker(3); + service.removeWorker(4); // 1 of 4 left, threshold is max(4 / 2, 1) = 2 + 
expect(service.isHealthy()).toBe(false); + }); + + it("single-worker setup goes unhealthy on crash", () => { + const service = createService(1); + startAllWorkers(service, 1); + expect(service.isHealthy()).toBe(true); + + service.removeWorker(1); + expect(service.isHealthy()).toBe(false); + }); + + it("odd worker count: threshold rounds up (3 workers)", () => { + const service = createService(3); + startAllWorkers(service, 3); + + // threshold = max(3 / 2, 1) = 1.5 (compared unrounded), so 2 ready is enough, 1 is not + service.removeWorker(3); + expect(service.isHealthy()).toBe(true); + + service.removeWorker(2); + expect(service.isHealthy()).toBe(false); + }); +});