diff --git a/package-lock.json b/package-lock.json index d71bfb067..3a75d7840 100644 --- a/package-lock.json +++ b/package-lock.json @@ -43,6 +43,7 @@ "page": "^1.11.6", "pg": "^8.13.3", "priority-queue-typescript": "^1.0.1", + "prom-client": "^15.1.3", "protobufjs": "^7.3.2", "pureimage": "^0.4.13", "raphael": "^2.3.0", @@ -4890,6 +4891,15 @@ "node": ">=10" } }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -7631,6 +7641,12 @@ "resolved": "https://registry.npmjs.org/binary-loader/-/binary-loader-0.0.1.tgz", "integrity": "sha512-LujAJL3IhYn4zVWZWAct9B6G5hdonNle+fWsG/u9QyY1OhOVp00jFgygUVr9SrBVw8doddyx9A/VS1/T9co4Dg==" }, + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", + "license": "MIT" + }, "node_modules/bl": { "version": "6.0.16", "resolved": "https://registry.npmjs.org/bl/-/bl-6.0.16.tgz", @@ -15845,6 +15861,19 @@ "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", "license": "MIT" }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/promise-retry": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/promise-retry/-/promise-retry-2.0.1.tgz", @@ -17579,6 +17608,15 @@ "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", "license": "ISC" }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/teeny-request": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-9.0.0.tgz", diff --git a/package.json b/package.json index 80dfc2883..ef6252574 100644 --- a/package.json +++ b/package.json @@ -110,6 +110,7 @@ "page": "^1.11.6", "pg": "^8.13.3", "priority-queue-typescript": "^1.0.1", + "prom-client": "^15.1.3", "protobufjs": "^7.3.2", "pureimage": "^0.4.13", "raphael": "^2.3.0", diff --git a/src/server/GameManager.ts b/src/server/GameManager.ts index 0029396a3..0d79f9a2d 100644 --- a/src/server/GameManager.ts +++ b/src/server/GameManager.ts @@ -40,6 +40,18 @@ export class GameManager { return game; } + activeGames(): number { + return this.games.size; + } + + activeClients(): number { + let totalClients = 0; + this.games.forEach((game: GameServer) => { + totalClients += game.activeClients.length; + }); + return totalClients; + } + tick() { const active = new Map(); for (const [id, game] of this.games) { diff --git a/src/server/Master.ts b/src/server/Master.ts index d5062d389..4a158b8f5 100644 --- a/src/server/Master.ts +++ b/src/server/Master.ts @@ -13,6 +13,7 @@ import path from "path"; import rateLimit from "express-rate-limit"; import { fileURLToPath } from "url"; import { gatekeeper, LimiterType } from "./Gatekeeper"; +import { setupMetricsServer } from "./MasterMetrics"; const config = getServerConfigFromServer(); const readyWorkers = new Set(); @@ -20,6 +21,10 @@ const readyWorkers = new Set(); const app = express(); const server = http.createServer(app); +// Create a separate metrics server on port 9090 +const metricsApp = express(); +const metricsServer = http.createServer(metricsApp); + const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); app.use(express.json()); @@ -135,6 +140,9 @@ export async function startMaster() { server.listen(PORT, () => { console.log(`Master HTTP server listening on port ${PORT}`); }); + + // Setup the metrics server + setupMetricsServer(); } app.get( diff --git a/src/server/MasterMetrics.ts b/src/server/MasterMetrics.ts new file mode 100644 index 000000000..bfc667d80 --- /dev/null +++ b/src/server/MasterMetrics.ts @@ -0,0 +1,79 @@ +import express from "express"; +import http from "http"; +import promClient from "prom-client"; +import { getServerConfigFromServer } from "../core/configuration/Config"; + +const config = getServerConfigFromServer(); + +// Create a separate metrics server on port 9090 +const metricsApp = express(); +const metricsServer = http.createServer(metricsApp); + +// Initialize the Prometheus registry for the master's own metrics +const register = new promClient.Registry(); + +// Prometheus metrics endpoint that gathers metrics from workers +export function setupMetricsServer() { + metricsApp.get("/metrics", async (req, res) => { + console.log("Metrics requested"); + try { + // Get the master's metrics + const masterMetrics = await register.metrics(); + + // Collect metrics from all workers + const workerMetricsPromises = []; + + // For each worker, fetch their metrics + for (let i = 0; i < config.numWorkers(); i++) { + const workerPort = config.workerPortByIndex(i); + const workerUrl = `http://localhost:${workerPort}/metrics`; + console.log(`Fetching metrics from worker ${i} at ${workerUrl}`); + const workerMetricsPromise = fetch(workerUrl, { + headers: { + [config.adminHeader()]: config.adminToken(), + }, + }) + .then((response) => { + if (!response.ok) { + throw new Error(`Worker ${i} returned status ${response.status}`); + } + return response.text(); + }) + .then((metricsText) => { + // Add worker label to each metric line + return metricsText.replace( + /^([a-z][a-z0-9_]*(?:{[^}]*})?)\s/gm, + `$1{worker="worker-${i}"} `, + ); + }) + .catch((error) => { + console.error(`Error fetching metrics from worker ${i}:`, error); + return `# Error fetching metrics from worker ${i}: ${error.message}`; + }); + workerMetricsPromises.push(workerMetricsPromise); + } + + // Wait for all worker metrics to be fetched + const workerMetricsArray = await Promise.all(workerMetricsPromises); + + // Add worker label to the master metrics + const masterMetricsWithLabel = masterMetrics.replace( + /^([a-z][a-z0-9_]*(?:{[^}]*})?)\s/gm, + `$1{worker="master"} `, + ); + + // Combine all metrics and send the response + res.set("Content-Type", register.contentType); + res.end(`${masterMetricsWithLabel}\n${workerMetricsArray.join("\n")}`); + } catch (error) { + console.error("Error collecting metrics:", error); + res.status(500).end(`# Error collecting metrics: ${error.message}`); + } + }); + + // Start the metrics server on port 9090 + const METRICS_PORT = 9090; + metricsServer.listen(METRICS_PORT, () => { + console.log(`Metrics server listening on port ${METRICS_PORT}`); + }); +} diff --git a/src/server/Worker.ts b/src/server/Worker.ts index f1fe0ecfa..b59598c79 100644 --- a/src/server/Worker.ts +++ b/src/server/Worker.ts @@ -17,6 +17,7 @@ import { slog } from "./StructuredLog"; import { GameType } from "../core/game/Game"; import { archive, readGameRecord } from "./Archive"; import { gatekeeper, LimiterType } from "./Gatekeeper"; +import { metrics } from "./WorkerMetrics"; const config = getServerConfigFromServer(); @@ -35,6 +36,11 @@ export function startWorker() { const gm = new GameManager(config); + // Set up periodic metrics updates + setInterval(() => { + metrics.updateGameMetrics(gm); + }, 15000); // Update every 15 seconds + // Middleware to handle /wX path prefix app.use((req, res, next) => { // Extract the original path without the worker prefix @@ -241,6 +247,24 @@ export function startWorker() { }), ); + app.get( + "/metrics", + gatekeeper.httpHandler(LimiterType.Get, async (req, res) => { + if (req.headers[config.adminHeader()] !== config.adminToken()) { + return res.status(403).end("Access denied"); + } + console.log(`metrics requested on worker ${workerId}`); + + try { + const metricsData = await metrics.register.metrics(); + res.set("Content-Type", metrics.register.contentType); + res.end(metricsData); + } catch (error) { + res.status(500).end(error.message); + } + }), + ); + // WebSocket handling wss.on("connection", (ws: WebSocket, req) => { ws.on( diff --git a/src/server/WorkerMetrics.ts b/src/server/WorkerMetrics.ts new file mode 100644 index 000000000..e20d92759 --- /dev/null +++ b/src/server/WorkerMetrics.ts @@ -0,0 +1,45 @@ +import promClient from "prom-client"; +import { GameManager } from "./GameManager"; + +// Initialize the Prometheus registry +const register = new promClient.Registry(); + +// Enable default Node.js metrics collection +promClient.collectDefaultMetrics({ register }); + +// Add worker-specific metrics +const activeGamesGauge = new promClient.Gauge({ + name: "active_games_count", + help: "Number of active games on this worker", + registers: [register], +}); + +const connectedClientsGauge = new promClient.Gauge({ + name: "connected_clients_count", + help: "Number of connected clients on this worker", + registers: [register], +}); + +const memoryUsageGauge = new promClient.Gauge({ + name: "memory_usage_bytes", + help: "Current memory usage of the worker process in bytes", + registers: [register], +}); + +// Export the metrics for use in the worker +export const metrics = { + register, + activeGamesGauge, + connectedClientsGauge, + memoryUsageGauge, + + // Function to update game-related metrics + updateGameMetrics: (gameManager: GameManager) => { + activeGamesGauge.set(gameManager.activeGames()); + connectedClientsGauge.set(gameManager.activeClients()); + + // Update memory usage metrics + const memoryUsage = process.memoryUsage(); + memoryUsageGauge.set(memoryUsage.heapUsed); + }, +}; diff --git a/update.sh b/update.sh index d9f67169f..31a97666f 100755 --- a/update.sh +++ b/update.sh @@ -77,7 +77,7 @@ if [ "$REGION" == "staging" ]; then fi echo "Starting new container for ${REGION} environment..." -docker run -d -p 80:80 \ +docker run -d -p 80:80 -p 127.0.0.1:9090:9090 \ --restart=always \ $VOLUME_MOUNTS \ --log-driver json-file \