"""Evaluate RAG pipeline results for the BIS Hackathon.

Reads a JSON results file and prints Hit Rate @3, MRR @5 and the average
latency over all queries in that file.
"""

import argparse
import json
import sys


def normalize_std(std_string):
    """Return the standard name with spaces removed and lowercased.

    The value is first converted with ``str()`` so non-string identifiers
    can be compared as well. Only the space character is stripped.
    """
    return str(std_string).replace(" ", "").lower()


def _as_list(value):
    """Coerce a JSON field into a list (``None`` -> ``[]``, scalar -> ``[scalar]``)."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare string would otherwise be iterated character by character.
    return [value]


def _as_float(value):
    """Convert a latency value to float, falling back to 0.0 when unusable."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _score_query(item):
    """Return ``(hit_at_3, reciprocal_rank_at_5, latency)`` for one query record."""
    expected = {normalize_std(std) for std in _as_list(item.get("expected_standards"))}
    retrieved = [normalize_std(std) for std in _as_list(item.get("retrieved_standards"))]
    latency = _as_float(item.get("latency_seconds", 0.0))

    # Hit @3: at least one expected standard appears among the first three.
    hit_at_3 = any(std in expected for std in retrieved[:3])

    # Reciprocal rank @5: only the first correct standard in the top five counts.
    reciprocal_rank = 0.0
    for rank, std in enumerate(retrieved[:5], start=1):
        if std in expected:
            reciprocal_rank = 1.0 / rank
            break

    return hit_at_3, reciprocal_rank, latency


def evaluate_results(results_file):
    """Load a results JSON file and print the evaluation summary.

    The file is expected to hold a JSON list of objects. Each object may
    carry ``expected_standards``, ``retrieved_standards`` and
    ``latency_seconds``; missing or ``null`` values are treated as empty
    lists / 0.0 seconds.

    Args:
        results_file: Path to the JSON file to evaluate.

    Returns:
        None. The metrics are printed to standard output. When the file
        holds no queries a notice is printed and the function returns early.

    Raises:
        SystemExit: With code 1 when the file cannot be read or parsed, or
            when its content is not a list of JSON objects.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 and also tolerates a leading BOM.
        with open(results_file, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading results file: {e}")
        sys.exit(1)

    if not data:
        print("No queries found in the result file.")
        return

    if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
        print("Error reading results file: expected a JSON list of query objects.")
        sys.exit(1)

    total_queries = len(data)
    hits_at_3 = 0
    mrr_sum_at_5 = 0.0
    total_latency = 0.0

    for item in data:
        hit, reciprocal_rank, latency = _score_query(item)
        hits_at_3 += int(hit)
        mrr_sum_at_5 += reciprocal_rank
        total_latency += latency

    # Final metrics
    hit_rate_3 = (hits_at_3 / total_queries) * 100
    mrr_5 = mrr_sum_at_5 / total_queries
    avg_latency = total_latency / total_queries

    print("=" * 40)
    print(" BIS HACKATHON EVALUATION RESULTS")
    print("=" * 40)
    print(f"Total Queries Evaluated : {total_queries}")
    print(f"Hit Rate @3 : {hit_rate_3:.2f}% \t(Target: >80%)")
    print(f"MRR @5 : {mrr_5:.4f} \t(Target: >0.7)")
    print(f"Avg Latency : {avg_latency:.2f} sec \t(Target: <5 seconds)")
    print("=" * 40)


def main(argv=None):
    """Parse command-line arguments and run the evaluation.

    Args:
        argv: Optional argument list; ``None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate RAG Pipeline Results for BIS Hackathon"
    )
    parser.add_argument(
        "--results",
        type=str,
        required=True,
        help="Path to the participant's output JSON file",
    )
    args = parser.parse_args(argv)

    evaluate_results(args.results)


if __name__ == "__main__":
    main()