docs: add JSDoc and normalize comments across server.

2026-05-03 00:16:42 +05:30
parent 33fe20021a
commit f88a45968a
5 changed files with 127 additions and 84 deletions
@@ -6,9 +6,9 @@ from stdin and writes newline-delimited JSON responses to stdout forever.
 Protocol (one line each direction):
  <- {"query": "...", "top_n": 5}
  -> {"results": [...], "latency_seconds": 0.15}
-  -> {"error": "..."}          (on failure — process stays alive)
+  -> {"error": "..."}          (on failure -- process stays alive)
-inference.py is imported as a module — zero lines of it are modified.
+inference.py is imported as a module -- zero lines of it are modified.
 """
 import sys
 import json
@@ -21,7 +21,7 @@ os.chdir(ROOT)
 import inference  # noqa: E402
 def main():
-    # Load once — this is the expensive step (~18s cold, ~0s warm)
+    # Load once -- this is the expensive step (~18s cold, ~0s warm)
    try:
        _, retriever = inference.load_or_build(force_rebuild=False)
    except Exception as exc:
@@ -11,10 +11,11 @@ const { generateExplanation, answerQuestion, rewriteQuery } = require("./service
 const { retrieve } = require("./services/retrieverService");
 const app  = express();
 /** @type {string|number} - HTTP port: the PORT env var (a string) when set, otherwise the number 5000. */
 const PORT = process.env.PORT || 5000;
-// ── Startup checks ──────────────────────────────────────────────────────────
+// Warn early when the Groq key is absent so AI degradation is visible at boot.
 if (!process.env.GROQ_API_KEY) {
  console.warn(
    "[WARN] GROQ_API_KEY is not set. AI features will return fallback values.\n" +
@@ -22,12 +23,12 @@ if (!process.env.GROQ_API_KEY) {
  );
 }
 // ── Security headers ─────────────────────────────────────────────────────────
 app.use(helmet());
-// ── CORS — restrict to configured origin or localhost dev ────────────────────
+/**
-
+ * @type {string[]} - Allowed CORS origins; reads CORS_ORIGIN env var (comma-separated) or
 *   falls back to localhost dev/preview ports.
 */
 const ALLOWED_ORIGINS = process.env.CORS_ORIGIN
  ? process.env.CORS_ORIGIN.split(",").map((o) => o.trim())
  : ["http://localhost:5173", "http://localhost:4173", `http://localhost:${PORT}`];
@@ -42,8 +43,7 @@ app.use(cors({
  allowedHeaders: ["Content-Type"],
 }));
-// ── Rate limiting ─────────────────────────────────────────────────────────────
+/** @type {import('express-rate-limit').RateLimitRequestHandler} - 60 req/min applied to all /api/ routes. */
 const apiLimiter = rateLimit({
  windowMs: 60 * 1000,
  max: 60,
@@ -52,6 +52,7 @@ const apiLimiter = rateLimit({
  message: { error: "Too many requests. Please wait a moment and try again." },
 });
 /** @type {import('express-rate-limit').RateLimitRequestHandler} - 20 req/min applied to LLM-backed endpoints. */
 const llmLimiter = rateLimit({
  windowMs: 60 * 1000,
  max: 20,
@@ -67,8 +68,7 @@ app.use("/api/chat",      llmLimiter);
 app.use(express.json({ limit: "16kb" }));
-// ── Load data ───────────────────────────────────────────────────────────────
+/** @type {string} - Absolute path to the processed data directory. */
 const DATA_DIR = path.join(__dirname, "../../data/processed");
 let standards = [];
@@ -84,7 +84,7 @@ try {
 // Pre-build lookups
 const standardsById  = {};
-const chunksByStd    = {};   // standard_id → [chunk, …]
+const chunksByStd    = {};   // standard_id: [chunk, ...]
 const byCategory     = {};
 const categories     = new Set();
@@ -99,34 +99,59 @@ for (const c of chunks) {
  chunksByStd[c.standard_id].push(c);
 }
-// ── Input sanitization ────────────────────────────────────────────────────────
+/** @type {RegExp} - Matches ASCII control characters that should be stripped from user input. */
 const CONTROL_CHAR_RE = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g;
 /**
  * Cleans a user-supplied value before it is used further.
  * Anything that is not a string yields ""; a string has the characters matched by
  * CONTROL_CHAR_RE removed, is then cut to at most maxLen UTF-16 code units, and is
  * finally trimmed of surrounding whitespace.
  * @param {*} value - Candidate input of any type.
  * @param {number} [maxLen=500] - Length cap, applied before trimming.
  * @returns {string}
  */
 function sanitizeText(value, maxLen = 500) {
  if (typeof value === "string") {
    const stripped = value.replace(CONTROL_CHAR_RE, "");
    const clipped = stripped.slice(0, maxLen);
    return clipped.trim();
  }
  return "";
 }
-// standard_id must match IS identifier pattern: letters/digits/spaces/colons/parens/dots/hyphens
+/** @type {RegExp} - Accepts IS standard IDs: letters, digits, spaces, colons, parens, dots, hyphens, slashes. */
 const STANDARD_ID_RE = /^[A-Za-z0-9 :()./-]{1,60}$/;
 /**
  * Checks whether a value looks like an IS standard identifier.
  * The value must be a string whose trimmed form matches STANDARD_ID_RE
  * (1-60 characters drawn from the allowed set).
  * @param {*} id - Value to validate.
  * @returns {boolean} true only for a string that passes the pattern after trimming.
  */
 function isValidStandardId(id) {
  if (typeof id !== "string") return false;
  const candidate = id.trim();
  return STANDARD_ID_RE.test(candidate);
 }
-// ── Structured logger ───────────────────────────────────────────────────────
+/**
-
+ * Writes a log line to stdout: a UTC ISO-8601 timestamp, the endpoint label, then the data serialized as JSON.
 * @param {string} endpoint - Route label, e.g. "POST /api/recommend".
 * @param {object} data - Arbitrary key/value pairs to include in the log entry.
 */
 function log(endpoint, data) {
  // Timestamp is captured at call time; toISOString() always renders in UTC.
  const stamp = new Date().toISOString();
  const prefix = `[${stamp}] ${endpoint} |`;
  // The payload is passed as a second argument, so console.log joins the two with a space.
  const payload = JSON.stringify(data);
  console.log(prefix, payload);
 }
-// ── Keyword-based search helper (unchanged from original) ───────────────────
+/**
-
+ * Normalises a string to lowercase alphanumeric tokens for keyword matching.
 * @param {string} str
 * @returns {string}
 */
 function normalize(str) {
  // Lowercase first so the character class below only has to list a-z.
  const lowered = str.toLowerCase();
  // Every character that is not an ASCII letter or digit becomes a single space.
  const spaced = lowered.replace(/[^a-z0-9]/g, " ");
  // Only [a-z0-9 ] remains, so dropping empty pieces collapses space runs and trims the ends.
  return spaced.split(" ").filter(Boolean).join(" ");
 }
 /**
 * Scores a standard against a query using weighted keyword matching across id, title,
 * keywords, summary, and category fields.
 * @param {object} standard - A standards.json record.
 * @param {string} query - Raw search query string.
 * @returns {number} Relevance score; higher is more relevant.
 */
 function scoreStandard(standard, query) {
  const q       = normalize(query);
  const qTokens = q.split(" ").filter(Boolean);
@@ -150,8 +175,13 @@ function scoreStandard(standard, query) {
  return s;
 }
-// ── Best chunk selector ─────────────────────────────────────────────────────
+/**
-
+ * Returns the chunk from a standard that best matches the given question via token overlap.
 * Falls back to the first chunk if no tokens produce a positive score.
 * @param {string} standardId - IS standard identifier key into chunksByStd.
 * @param {string} question - User question used for token matching.
 * @returns {{ text: string, section: string, chunk_id: string, standard_id: string } | null}
 */
 function bestChunk(standardId, question) {
  const stdChunks = chunksByStd[standardId] || [];
  if (!stdChunks.length) return null;
@@ -168,11 +198,13 @@ function bestChunk(standardId, question) {
  return best;
 }
 // ═══════════════════════════════════════════════════════════════════════════
 // Routes
 // ═══════════════════════════════════════════════════════════════════════════
-// ── GET /api/standards ──────────────────────────────────────────────────────
+/**
 * GET /api/standards
 * Returns a paginated, optionally filtered and keyword-scored list of standards.
 * Query params: q (search string), category, page, limit.
 */
 app.get("/api/standards", (req, res) => {
  const q        = sanitizeText(req.query.q || "", 200);
  const category = sanitizeText(req.query.category || "", 100);
@@ -197,7 +229,10 @@ app.get("/api/standards", (req, res) => {
  res.json({ data: paginated, meta: { total, page: pageNum, limit: limitNum, totalPages } });
 });
-// ── GET /api/standards/:id ──────────────────────────────────────────────────
+/**
 * GET /api/standards/:id
 * Returns a single standard by its IS identifier; 404 if not found, 400 if the id is malformed.
 */
 app.get("/api/standards/:id", (req, res) => {
  const raw = decodeURIComponent(req.params.id);
  if (!isValidStandardId(raw)) {
@@ -208,7 +243,10 @@ app.get("/api/standards/:id", (req, res) => {
  res.json(standard);
 });
-// ── GET /api/categories ─────────────────────────────────────────────────────
+/**
 * GET /api/categories
 * Returns all categories sorted alphabetically, each with its standard count.
 */
 app.get("/api/categories", (req, res) => {
  const result = [...categories].sort().map((cat) => ({
    name:  cat,
@@ -217,7 +255,10 @@ app.get("/api/categories", (req, res) => {
  res.json(result);
 });
-// ── GET /api/stats ──────────────────────────────────────────────────────────
+/**
 * GET /api/stats
 * Returns aggregate counts of standards, categories, and chunks loaded in memory.
 */
 app.get("/api/stats", (req, res) => {
  res.json({
    totalStandards:  standards.length,
@@ -226,16 +267,13 @@ app.get("/api/stats", (req, res) => {
  });
 });
 // ── POST /api/recommend ─────────────────────────────────────────────────────
 /**
- * Input:  { query: string, top_n?: number, rewrite?: boolean }
+ * POST /api/recommend
- * Flow:
+ * Hybrid retrieval endpoint: optionally rewrites the query, calls the Python daemon,
- *   1. Optionally rewrite query with LLM (parallel, non-blocking on failure)
+ * then attaches parallel LLM explanations to each result.
 *   2. Call Python inference.py via bridge (retrieval logic untouched)
 *   3. Enrich each result with an LLM explanation (jobs run in parallel, awaited with Promise.all)
 *   4. Return standards + explanations + timing breakdown
 *
- * Output: { standards, latency: { retrieval_ms, llm_ms, total_ms } }
+ * @param {{ query: string, top_n?: number, rewrite?: boolean }} req.body
 * @returns {{ query: string, standards: Array, latency: { retrieval_ms: number, llm_ms: number, total_ms: number } }}
 */
 app.post("/api/recommend", async (req, res) => {
  const rawQuery = req.body?.query;
@@ -249,13 +287,13 @@ app.post("/api/recommend", async (req, res) => {
  const t0 = Date.now();
-  // Step 1 — Optional query rewrite (fires concurrently, falls back silently)
+  // Step 1 - Optional query rewrite (awaited before retrieval; falls back to the original query on failure)
  let effectiveQuery = query;
  if (rewrite && process.env.GROQ_API_KEY) {
    effectiveQuery = await rewriteQuery(query.trim()); // never throws
  }
-  // Step 2 — Python retrieval (inference.py untouched)
+  // Step 2 - Python retrieval (inference.py untouched)
  let retrievalResult;
  const tRetStart = Date.now();
  try {
@@ -268,7 +306,7 @@ app.post("/api/recommend", async (req, res) => {
  const { results: retrieved, latency_seconds: pyLatency } = retrievalResult;
-  // Step 3 — LLM explanations fired in parallel (allSettled — never blocks on failure)
+  // Step 3 - LLM explanations fired in parallel (awaited with Promise.all, not allSettled - assumes each job resolves rather than rejects; TODO confirm)
  const tLlmStart = Date.now();
  const explanationJobs = retrieved.map((r) => {
    const std = standardsById[r.standard_id];
@@ -281,7 +319,7 @@ app.post("/api/recommend", async (req, res) => {
  const explanations = await Promise.all(explanationJobs);
  const llmMs = Date.now() - tLlmStart;
-  // Step 4 — Assemble response
+  // Step 4 - Assemble response
  const standardsOut = retrieved.map((r, i) => {
    const std = standardsById[r.standard_id] || {};
    return {
@@ -316,15 +354,12 @@ app.post("/api/recommend", async (req, res) => {
  });
 });
 // ── POST /api/ask ───────────────────────────────────────────────────────────
 /**
- * Input:  { question: string, standard_id: string }
+ * POST /api/ask
- * Flow:
+ * Answers a question grounded in the best-matching chunk of a specific standard.
 *   1. Find best matching chunk for the question within the standard
 *   2. Pass chunk text to answerQuestion() — strictly grounded
 *   3. Return answer + chunk source info
 *
- * Output: { answer, source: { standard_id, section, chunk_id } }
+ * @param {{ question: string, standard_id: string }} req.body
 * @returns {{ answer: string, source: { standard_id: string, section: string, chunk_id: string }, latency: object }}
 */
 app.post("/api/ask", async (req, res) => {
  const question    = sanitizeText(req.body?.question, 500);
@@ -367,10 +402,12 @@ app.post("/api/ask", async (req, res) => {
  });
 });
 // ── POST /api/chat ──────────────────────────────────────────────────────────
 /**
- * Conversational QA grounded in a standard's full text.
+ * POST /api/chat
- * Uses answerQuestion() from llmService — key never leaves server.
+ * Conversational QA grounded in a standard's full text; 503 if GROQ_API_KEY is absent.
 *
 * @param {{ question: string, standard_id?: string }} req.body
 * @returns {{ answer: string }}
 */
 app.post("/api/chat", async (req, res) => {
  if (!process.env.GROQ_API_KEY) {
@@ -408,7 +445,6 @@ app.post("/api/chat", async (req, res) => {
  res.json({ answer });
 });
 // ── Start ───────────────────────────────────────────────────────────────────
 const server = app.listen(PORT, () => {
  console.log(`[init] BIS API running on http://localhost:${PORT}`);
 });
@@ -2,21 +2,25 @@
 /**
 * llmService.js
 * All Groq LLM calls live here. Three functions:
- *   generateExplanation(standard)  — 2-3 sentence plain-English summary
+ *   generateExplanation(standard)  - 2-3 sentence plain-English summary
- *   answerQuestion(question, chunk) — grounded QA, strict context-only
+ *   answerQuestion(question, chunk) - grounded QA, strict context-only
- *   rewriteQuery(query)            — optional query expansion
+ *   rewriteQuery(query)            - optional query expansion
 *
 * Key rules enforced here:
 *  - GROQ_API_KEY never leaves this file toward the client
 *  - max_tokens kept short to minimise latency (<400 tokens each)
- *  - Every function returns a fallback value on failure — callers never throw
+ *  - Every public function (generateExplanation, answerQuestion, rewriteQuery) returns a fallback value on failure - it never throws to its caller
 */
 const GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions";
 const MODEL = "llama-3.1-8b-instant";
-// ── Core fetch wrapper ──────────────────────────────────────────────────────
+/**
-
+ * Sends a single chat-completion request to the Groq API.
 * @param {{ systemPrompt: string, userMessage: string, maxTokens?: number, temperature?: number }} options
 * @returns {Promise<string>} Trimmed text content from the first choice.
 * @throws {Error} If the API key is missing or the response is non-2xx.
 */
 async function _groqCall({ systemPrompt, userMessage, maxTokens = 256, temperature = 0.2 }) {
  const key = process.env.GROQ_API_KEY;
  if (!key) throw new Error("GROQ_API_KEY not set");
@@ -47,11 +51,9 @@ async function _groqCall({ systemPrompt, userMessage, maxTokens = 256, temperatu
  return data.choices?.[0]?.message?.content?.trim() ?? "";
 }
 // ── 1. generateExplanation ──────────────────────────────────────────────────
 /**
 * Produces a 2-3 sentence plain-English explanation of a standard.
- * Falls back to the standard's own summary on failure — never throws.
+ * Falls back to the standard's own summary on failure - never throws.
 *
 * @param {{ standard_id: string, title: string, summary?: string, content?: string, key_sections?: object }} standard
 * @returns {Promise<string>}
@@ -64,7 +66,7 @@ async function generateExplanation(standard) {
      systemPrompt:
        "You are a technical writer for the Bureau of Indian Standards (BIS). " +
        "Explain building material standards in simple English for engineers and contractors. " +
-        "Use ONLY the provided standard text — do not add, invent, or infer anything not explicitly stated. " +
+        "Use ONLY the provided standard text -- do not add, invent, or infer anything not explicitly stated. " +
        "Write exactly 2-3 sentences. No bullet points. No headings.",
      userMessage:
        `Explain this BIS standard in simple terms using ONLY the provided information:\n\n${context}`,
@@ -72,20 +74,18 @@ async function generateExplanation(standard) {
      temperature: 0.2,
    });
  } catch (err) {
-    // Graceful fallback — retrieval is unaffected
+    // Graceful fallback -- retrieval is unaffected
    return standard.summary || standard.title || "";
  }
 }
 // ── 2. answerQuestion ───────────────────────────────────────────────────────
 /**
 * Answers a user question strictly from a chunk of standard text.
 * Returns "Not found in standard" when context doesn't contain the answer.
 * Never throws.
 *
 * @param {string} question
- * @param {string} chunkText  — raw chunk text from standards_chunks.json
+ * @param {string} chunkText  -- raw chunk text from standards_chunks.json
 * @returns {Promise<string>}
 */
 async function answerQuestion(question, chunkText) {
@@ -111,11 +111,9 @@ async function answerQuestion(question, chunkText) {
  }
 }
 // ── 3. rewriteQuery (optional) ──────────────────────────────────────────────
 /**
 * Rewrites a vague natural-language query into precise IS-standard keywords.
- * Falls back to the original query on failure — retrieval is never blocked.
+ * Falls back to the original query on failure -- retrieval is never blocked.
 *
 * @param {string} query
 * @returns {Promise<string>}
@@ -129,12 +127,12 @@ async function rewriteQuery(query) {
        "You are a search query optimizer for the BIS SP-21 building materials standards database. " +
        "Convert the user's natural-language query into 3-6 precise technical keywords " +
        "suitable for searching Indian Standards (IS) documents. " +
-        "Output ONLY the keywords separated by spaces — no explanation, no punctuation.",
+        "Output ONLY the keywords separated by spaces -- no explanation, no punctuation.",
      userMessage: query.trim(),
      maxTokens: 40,
      temperature: 0.1,
    });
-    // Sanity check — if rewrite is too long or garbled, fall back
+    // Sanity check -- if rewrite is too long or garbled, fall back
    const words = rewritten.trim().split(/\s+/);
    if (words.length >= 2 && words.length <= 10) return rewritten.trim();
    return query;
@@ -143,10 +141,14 @@ async function rewriteQuery(query) {
  }
 }
-// ── Helpers ─────────────────────────────────────────────────────────────────
+/**
-
+ * Assembles a compact text block from a standard's fields for use as LLM context.
 * Caps each key section at 300 characters to keep token count low.
 * @param {{ standard_id: string, title: string, category?: string, summary?: string, key_sections?: object }} standard
 * @returns {string}
 */
 function buildStandardContext(standard) {
-  const parts = [`Standard: ${standard.standard_id} — ${standard.title}`];
+  const parts = [`Standard: ${standard.standard_id} -- ${standard.title}`];
  if (standard.category) parts.push(`Category: ${standard.category}`);
  if (standard.summary)  parts.push(`Summary: ${standard.summary}`);
@@ -1,6 +1,6 @@
 "use strict";
 /**
- * retrieverService.js — persistent Python daemon.
+ * retrieverService.js -- persistent Python daemon.
 *
 * Spawns retrieve.py ONCE when the Node server starts. The Python process
 * loads the FAISS index and BM25 index once, then serves queries via
@@ -17,12 +17,17 @@ const path       = require("path");
 const readline   = require("readline");
 const { EventEmitter } = require("events");
 /** @type {string} - Absolute path to bridge/retrieve.py. */
 const BRIDGE  = path.join(__dirname, "../bridge/retrieve.py");
 /** @type {string} - Repository root, used as cwd for the Python subprocess. */
 const ROOT    = path.join(__dirname, "../../..");
 /** @type {string} - Python executable; override with PYTHON_BIN env var. */
 const PYTHON  = process.env.PYTHON_BIN || "python";
-const BOOT_TIMEOUT_MS  = 90_000;   // Python cold-start budget
+/** @type {number} - Maximum milliseconds to wait for the daemon to signal ready on cold start. */
-const QUERY_TIMEOUT_MS = 10_000;   // per-query budget once warm
+const BOOT_TIMEOUT_MS  = 90_000;
 /** @type {number} - Maximum milliseconds to wait for a single query response once the daemon is warm. */
 const QUERY_TIMEOUT_MS = 10_000;
 class PythonRetriever extends EventEmitter {
  constructor() {
@@ -40,7 +45,7 @@ class PythonRetriever extends EventEmitter {
    this._ready = false;
    this._error = null;
-    console.log("[retriever] Starting Python daemon (first boot ~20s)…");
+    console.log("[retriever] Starting Python daemon (first boot ~20s)...");
    this._proc = spawn(PYTHON, [BRIDGE], {
      cwd: ROOT,
@@ -98,11 +103,11 @@ class PythonRetriever extends EventEmitter {
    try { msg = JSON.parse(raw); }
    catch { return; }  // ignore non-JSON (e.g. sentence-transformers progress bars)
-    // ── Startup handshake ──
+    // Startup handshake: wait for {"ready":true} before flushing the queue.
    if (!this._ready) {
      if (msg.ready) {
        this._ready = true;
-        console.log(`[retriever] Ready — flushing ${this._queue.length} queued request(s).`);
+        console.log(`[retriever] Ready -- flushing ${this._queue.length} queued request(s).`);
        // Send all queued requests in order
        for (const item of this._queue) {
          this._pending.push(item);
@@ -118,7 +123,7 @@ class PythonRetriever extends EventEmitter {
      return;
    }
-    // ── Query response — FIFO ──
+    // Query response -- resolve/reject the oldest in-flight request (FIFO).
    const item = this._pending.shift();
    if (!item) return;
    clearTimeout(item.timer);
@@ -178,7 +183,7 @@ class PythonRetriever extends EventEmitter {
  }
 }
-// Singleton — one daemon for the lifetime of the Node process
+// Singleton -- one daemon for the lifetime of the Node process
 const retriever = new PythonRetriever();
 module.exports = { retrieve: (q, n) => retriever.retrieve(q, n) };
@@ -1,5 +1,5 @@
 /**
- * start.js — safe server launcher
+ * start.js - safe server launcher
 * Kills any process already on PORT before starting index.js.
 * Run with: node web/server/start.js
 */
@@ -24,7 +24,7 @@ function killPort(port) {
      execSync(`fuser -k ${port}/tcp`, { stdio: "ignore" });
    }
  } catch {
-    // No process on that port — fine
+    // No process on that port -- fine
  }
 }