feat(server): add health api endpoint for increased observability (#3264)

## Description:

Adds an additional API endpoint to the server for health, using the
master lobby service as the health metric. The master lobby service is
considered healthy if the lobby service has started (i.e. it had enough
ready workers to start), and the current number of ready workers is at
least half of the desired number (and never fewer than one).

This means that we won't show as healthy until all the workers start,
and then we will continue to show as healthy even if a few workers
crash, as long as at least half are still running. Any fewer
than that, and the service becomes unhealthy.

The endpoint is also set to "no cache" in the nginx config. This ensures
that any check of the server health shows the true value, and cannot
show false/stale data served by nginx, Cloudflare, or anything else.

## Please complete the following:

- [x] I have added screenshots for all UI updates
- [x] I process any text displayed to the user through translateText()
and I've added it to the en.json file
- [x] I have added relevant tests to the test directory
- [x] I confirm I have thoroughly tested these changes and take full
responsibility for any bugs introduced

## Please put your Discord username so you can be contacted if a bug or
regression is found:

jish
This commit is contained in:
Josh Harris
2026-02-21 22:52:47 +00:00
committed by GitHub
parent f09d9a3a5f
commit 05af154b58
4 changed files with 155 additions and 1 deletions
+20 -1
View File
@@ -120,6 +120,25 @@ server {
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# /api/health endpoint - No caching, always hit the backend
location = /api/health {
    proxy_pass http://127.0.0.1:3000;
    proxy_http_version 1.1;

    # Cache configuration - No caching for health checks.
    # `always` matters here: without it nginx only attaches add_header
    # fields to a fixed set of 2xx/3xx responses, so an error response from
    # the backend (e.g. a 503 for an unhealthy service) would be sent
    # without any of the no-cache headers below.
    proxy_cache off;
    add_header X-Cache-Status "BYPASS" always;
    add_header Cache-Control "no-store, no-cache, must-revalidate, proxy-revalidate" always;
    add_header Pragma "no-cache" always;
    add_header Expires "0" always;

    # Standard proxy headers
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}
# /commit.txt endpoint - Cache for 5 seconds
location = /commit.txt {
@@ -250,4 +269,4 @@ server {
proxy_set_header X-Forwarded-Proto $scheme;
}
}
}
+9
View File
@@ -145,6 +145,15 @@ app.get("/api/env", async (req, res) => {
res.json(envConfig);
});
// Health probe: responds 200 {"status":"ok"} when the lobby service reports
// itself healthy, otherwise 503 {"status":"unavailable"}. A missing
// lobbyService is treated as unhealthy.
app.get("/api/health", (_req, res) => {
  const healthy = lobbyService?.isHealthy() ?? false;
  if (!healthy) {
    res.status(503).json({ status: "unavailable" });
    return;
  }
  res.json({ status: "ok" });
});
// SPA fallback route
app.get("*", async function (_req, res) {
try {
+7
View File
@@ -59,6 +59,13 @@ export class MasterLobbyService {
this.readyWorkers.delete(workerId);
}
/**
 * Reports whether the lobby service should be considered healthy.
 *
 * Healthy means the service has started AND the number of ready workers is
 * at least half of the configured worker count (with a floor of one), so a
 * minority of crashed workers does not flip the result.
 *
 * @returns true when started and enough workers are ready, false otherwise.
 */
isHealthy(): boolean {
  const requiredReady = Math.max(1, this.config.numWorkers() / 2);
  if (!this.started) {
    return false;
  }
  return this.readyWorkers.size >= requiredReady;
}
private handleWorkerReady(workerId: number) {
this.readyWorkers.add(workerId);
this.log.info(
@@ -0,0 +1,119 @@
import EventEmitter from "events";
import { describe, expect, it, vi } from "vitest";
import { MasterLobbyService } from "../../src/server/MasterLobbyService";
import { TestServerConfig } from "../util/TestServerConfig";
// Replace the Logger module with a stub: logger.child() hands back an object
// whose error/info are fresh mock functions on every call.
// The factory is kept self-contained (no references to outer variables).
vi.mock("../../src/server/Logger", () => {
  const child = () => {
    return {
      error: vi.fn(),
      info: vi.fn(),
    };
  };
  return { logger: { child } };
});
// Replace startPolling with a mock function (presumably so constructing the
// service under test does not start a real polling loop -- confirm against
// MasterLobbyService).
vi.mock("../../src/server/PollingLoop", () => {
  return { startPolling: vi.fn() };
});
/**
 * Builds a stand-in worker: a plain EventEmitter with an extra mocked
 * `send` function attached as an own property.
 */
function createMockWorker(): EventEmitter {
  return Object.assign(new EventEmitter(), { send: vi.fn() });
}
/**
 * Emits a "message" event on the given worker carrying a `workerReady`
 * payload for `workerId`.
 */
function sendWorkerReady(worker: EventEmitter, workerId: number) {
  const readyMessage = { type: "workerReady", workerId };
  worker.emit("message", readyMessage);
}
/**
 * Creates a MasterLobbyService whose config reports `numWorkers` as the
 * worker count. The second constructor argument is an empty placeholder
 * object and the logger is a stub exposing mocked info/error.
 */
function createService(numWorkers: number): MasterLobbyService {
  const serverConfig = new TestServerConfig();
  vi.spyOn(serverConfig, "numWorkers").mockReturnValue(numWorkers);
  const stubLogger = { info: vi.fn(), error: vi.fn() } as any;
  const placeholder = {} as any;
  return new MasterLobbyService(serverConfig, placeholder, stubLogger);
}
/**
 * Registers `count` mock workers (ids 1..count) with the service, then
 * marks each of them ready, in id order.
 *
 * All workers are registered before the first ready message is sent.
 *
 * @returns the registered workers paired with their ids.
 */
function startAllWorkers(
  service: MasterLobbyService,
  count: number,
): { id: number; w: EventEmitter }[] {
  const registered: { id: number; w: EventEmitter }[] = [];
  for (let id = 1; id <= count; id++) {
    const w = createMockWorker();
    service.registerWorker(id, w as any);
    registered.push({ id, w });
  }
  registered.forEach(({ id, w }) => {
    sendWorkerReady(w, id);
  });
  return registered;
}
// Covers MasterLobbyService.isHealthy: the service must have started, and the
// ready-worker count must be at least max(numWorkers / 2, 1).
describe("MasterLobbyService.isHealthy", () => {
  it("unhealthy before any workers register", () => {
    const service = createService(4);
    expect(service.isHealthy()).toBe(false);
  });

  it("unhealthy when workers registered but not ready", () => {
    const service = createService(2);
    service.registerWorker(1, createMockWorker() as any);
    expect(service.isHealthy()).toBe(false);
  });

  it("unhealthy when only some workers are ready (server not started)", () => {
    const service = createService(4);
    // 1 of 4 ready -- not enough to flip `started`
    const w1 = createMockWorker();
    service.registerWorker(1, w1 as any);
    sendWorkerReady(w1, 1);
    expect(service.isHealthy()).toBe(false);
  });

  it("healthy once all workers are ready", () => {
    const service = createService(2);
    startAllWorkers(service, 2);
    expect(service.isHealthy()).toBe(true);
  });

  it("stays healthy after a single worker crash", () => {
    const service = createService(4);
    startAllWorkers(service, 4);
    service.removeWorker(4); // 3 of 4 left, threshold is 2
    expect(service.isHealthy()).toBe(true);
  });

  // Boundary case: the threshold comparison is inclusive, so exactly half of
  // the workers being ready still counts as healthy.
  it("stays healthy at exactly half of the workers", () => {
    const service = createService(4);
    startAllWorkers(service, 4);
    service.removeWorker(3);
    service.removeWorker(4); // 2 of 4 left, threshold is 2 (inclusive)
    expect(service.isHealthy()).toBe(true);
  });

  it("goes unhealthy when too many workers crash", () => {
    const service = createService(4);
    startAllWorkers(service, 4);
    service.removeWorker(2);
    service.removeWorker(3);
    service.removeWorker(4); // 1 of 4 left, threshold is 2
    expect(service.isHealthy()).toBe(false);
  });

  it("single-worker setup goes unhealthy on crash", () => {
    const service = createService(1);
    startAllWorkers(service, 1);
    expect(service.isHealthy()).toBe(true);
    service.removeWorker(1); // 0 left, threshold floor is 1
    expect(service.isHealthy()).toBe(false);
  });

  it("odd worker count: threshold rounds up (3 workers)", () => {
    const service = createService(3);
    startAllWorkers(service, 3);
    // min = 3/2 = 1.5, so 2 ready is enough, 1 is not
    service.removeWorker(3);
    expect(service.isHealthy()).toBe(true);
    service.removeWorker(2);
    expect(service.isHealthy()).toBe(false);
  });
});