From d2a75be7b68814b9258e2b37960246b044437760 Mon Sep 17 00:00:00 2001 From: Kshitij <160704796+kshitij-ka@users.noreply.github.com> Date: Sun, 3 May 2026 17:25:25 +0530 Subject: [PATCH] fix: harden server input validation and prevent information leakage. --- web/server/bridge/retrieve.py | 6 +++--- web/server/index.js | 10 +++++----- web/server/services/retrieverService.js | 22 ++++++++++++++++++---- web/server/start.js | 14 ++++++++++---- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/web/server/bridge/retrieve.py b/web/server/bridge/retrieve.py index 89716c5..3eb8bee 100644 --- a/web/server/bridge/retrieve.py +++ b/web/server/bridge/retrieve.py @@ -42,11 +42,11 @@ def main(): try: req = json.loads(raw_line) query = req.get("query", "") - top_n = int(req.get("top_n", 5)) + top_n = max(1, min(int(req.get("top_n", 5)), 20)) results, latency = retriever.retrieve(query, top_n=top_n) response = {"results": results, "latency_seconds": round(latency, 4)} - except Exception as exc: - response = {"error": str(exc)} + except Exception: + response = {"error": "retrieval_failed"} sys.stdout.write(json.dumps(response) + "\n") sys.stdout.flush() diff --git a/web/server/index.js b/web/server/index.js index 39fe056..f70b908 100644 --- a/web/server/index.js +++ b/web/server/index.js @@ -99,8 +99,8 @@ for (const c of chunks) { chunksByStd[c.standard_id].push(c); } -/** @type {RegExp} - Matches ASCII control characters that should be stripped from user input. */ -const CONTROL_CHAR_RE = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g; +/** @type {RegExp} - Matches ASCII control characters and Unicode BiDi override characters that should be stripped from user input. */ +const CONTROL_CHAR_RE = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\u202A-\u202E\u2066-\u2069]/g; /** * Strips control characters and truncates a string to a safe length. @@ -114,8 +114,8 @@ function sanitizeText(value, maxLen = 500) { return value.replace(CONTROL_CHAR_RE, "").slice(0, maxLen).trim(); } -/** @type {RegExp} - Accepts IS standard IDs: letters, digits, spaces, colons, parens, dots, hyphens, slashes. */ -const STANDARD_ID_RE = /^[A-Za-z0-9 :()./-]{1,60}$/; +/** @type {RegExp} - Accepts IS standard IDs: letters, digits, spaces, colons, parens, dots, hyphens. */ +const STANDARD_ID_RE = /^[A-Za-z0-9 :().-]{1,60}$/; /** * Returns true if the value is a well-formed IS standard identifier. @@ -336,7 +336,7 @@ app.post("/api/recommend", async (req, res) => { const totalMs = Date.now() - t0; log("POST /api/recommend", { - query: effectiveQuery, + query: sanitizeText(effectiveQuery, 200), results: retrieved.length, retrieval_ms: retrievalMs, llm_ms: llmMs, diff --git a/web/server/services/retrieverService.js b/web/server/services/retrieverService.js index 4c51f27..aeccfac 100644 --- a/web/server/services/retrieverService.js +++ b/web/server/services/retrieverService.js @@ -21,8 +21,14 @@ const { EventEmitter } = require("events"); const BRIDGE = path.join(__dirname, "../bridge/retrieve.py"); /** @type {string} - Repository root, used as cwd for the Python subprocess. */ const ROOT = path.join(__dirname, "../../.."); -/** @type {string} - Python executable; override with PYTHON_BIN env var. */ -const PYTHON = process.env.PYTHON_BIN || "python"; +/** @type {string} - Python executable; override with PYTHON_BIN env var (must be "python", "python3", or an absolute path to a Python interpreter). */ +const _pythonRaw = process.env.PYTHON_BIN || "python"; +const _PYTHON_ALLOWLIST = /^(python[23]?|\/[^\0]+)$/; +if (!_PYTHON_ALLOWLIST.test(_pythonRaw)) { + console.error(`[retriever] Invalid PYTHON_BIN value: ${JSON.stringify(_pythonRaw)}. Must be "python", "python3", or an absolute path.`); + process.exit(1); +} +const PYTHON = _pythonRaw; /** @type {number} - Maximum milliseconds to wait for the daemon to signal ready on cold start. */ const BOOT_TIMEOUT_MS = 90_000; @@ -131,9 +137,17 @@ class PythonRetriever extends EventEmitter { if (msg.error) { item.reject(new Error(msg.error)); } else { + const raw = Array.isArray(msg.results) ? msg.results : []; + const ALLOWED = new Set(["standard_id", "title", "category", "matched_section", "score"]); + const results = raw.map((r) => { + if (typeof r !== "object" || r === null) return null; + const safe = {}; + for (const k of ALLOWED) if (k in r) safe[k] = r[k]; + return safe; + }).filter(Boolean); item.resolve({ - results: msg.results || [], - latency_seconds: msg.latency_seconds ?? 0, + results, + latency_seconds: typeof msg.latency_seconds === "number" ? msg.latency_seconds : 0, }); } } diff --git a/web/server/start.js b/web/server/start.js index abd1645..cafced8 100644 --- a/web/server/start.js +++ b/web/server/start.js @@ -3,8 +3,14 @@ * Kills any process already on PORT before starting index.js. * Run with: node web/server/start.js */ -const { execSync, spawn } = require("child_process"); -const PORT = process.env.PORT || 5000; +const { execSync, spawnSync, spawn } = require("child_process"); + +const rawPort = process.env.PORT || "5000"; +const PORT = parseInt(rawPort, 10); +if (!Number.isInteger(PORT) || PORT < 1 || PORT > 65535) { + console.error(`[start] Invalid PORT value: ${JSON.stringify(rawPort)}`); + process.exit(1); +} function killPort(port) { try { @@ -18,10 +24,10 @@ function killPort(port) { } for (const pid of pids) { console.log(`[start] Killing stale process PID ${pid} on port ${port}`); - execSync(`taskkill /PID ${pid} /F`, { stdio: "ignore" }); + spawnSync("taskkill", ["/PID", pid, "/F"], { stdio: "ignore" }); } } else { - execSync(`fuser -k ${port}/tcp`, { stdio: "ignore" }); + spawnSync("fuser", [`${port}/tcp`, "-k"], { stdio: "ignore" }); } } catch { // No process on that port -- fine