chore: add evaluation scripts.

2026-04-28 23:54:45 +05:30
parent 13fd04e7e1
commit 434c8b288a
1 changed files with 77 additions and 0 deletions
@@ -0,0 +1,77 @@
+import json
+import argparse
+import sys
+
+
+def normalize_std(std_string):
+    """Normalizes the standard name by removing spaces and converting to lowercase for fair matching."""
+    return str(std_string).replace(" ", "").lower()
+
+
+def evaluate_results(results_file):
+    try:
+        with open(results_file, "r") as f:
+            data = json.load(f)
+    except Exception as e:
+        print(f"Error reading results file: {e}")
+        sys.exit(1)
+
+    total_queries = len(data)
+    if total_queries == 0:
+        print("No queries found in the result file.")
+        return
+
+    hits_at_3 = 0
+    mrr_sum_at_5 = 0.0
+    total_latency = 0.0
+
+    for item in data:
+        # Normalize expected and retrieved standards
+        expected = set(normalize_std(std) for std in item.get("expected_standards", []))
+        retrieved = [normalize_std(std) for std in item.get("retrieved_standards", [])]
+        latency = item.get("latency_seconds", 0.0)
+
+        total_latency += latency
+
+        # 1. Calculate Hit Rate @3 (Is at least 1 expected standard in top 3?)
+        top_3_retrieved = retrieved[:3]
+        if any(std in expected for std in top_3_retrieved):
+            hits_at_3 += 1
+
+        # 2. Calculate MRR @5 (Mean Reciprocal Rank of first correct standard in top 5)
+        top_5_retrieved = retrieved[:5]
+        mrr = 0.0
+        for rank, std in enumerate(top_5_retrieved, start=1):
+            if std in expected:
+                mrr = 1.0 / rank
+                break  # Only care about the first correct standard
+        mrr_sum_at_5 += mrr
+
+    # Calculate Final Metrics
+    hit_rate_3 = (hits_at_3 / total_queries) * 100
+    mrr_5 = mrr_sum_at_5 / total_queries
+    avg_latency = total_latency / total_queries
+
+    print("=" * 40)
+    print("   BIS HACKATHON EVALUATION RESULTS")
+    print("=" * 40)
+    print(f"Total Queries Evaluated : {total_queries}")
+    print(f"Hit Rate @3             : {hit_rate_3:.2f}% \t(Target: >80%)")
+    print(f"MRR @5                  : {mrr_5:.4f} \t(Target: >0.7)")
+    print(f"Avg Latency             : {avg_latency:.2f} sec \t(Target: <5 seconds)")
+    print("=" * 40)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Evaluate RAG Pipeline Results for BIS Hackathon"
+    )
+    parser.add_argument(
+        "--results",
+        type=str,
+        required=True,
+        help="Path to the participant's output JSON file",
+    )
+    args = parser.parse_args()
+
+    evaluate_results(args.results)