chore: add evaluation scripts.
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
import json
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
def normalize_std(std_string):
|
||||
"""Normalizes the standard name by removing spaces and converting to lowercase for fair matching."""
|
||||
return str(std_string).replace(" ", "").lower()
|
||||
|
||||
|
||||
def evaluate_results(results_file):
|
||||
try:
|
||||
with open(results_file, "r") as f:
|
||||
data = json.load(f)
|
||||
except Exception as e:
|
||||
print(f"Error reading results file: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
total_queries = len(data)
|
||||
if total_queries == 0:
|
||||
print("No queries found in the result file.")
|
||||
return
|
||||
|
||||
hits_at_3 = 0
|
||||
mrr_sum_at_5 = 0.0
|
||||
total_latency = 0.0
|
||||
|
||||
for item in data:
|
||||
# Normalize expected and retrieved standards
|
||||
expected = set(normalize_std(std) for std in item.get("expected_standards", []))
|
||||
retrieved = [normalize_std(std) for std in item.get("retrieved_standards", [])]
|
||||
latency = item.get("latency_seconds", 0.0)
|
||||
|
||||
total_latency += latency
|
||||
|
||||
# 1. Calculate Hit Rate @3 (Is at least 1 expected standard in top 3?)
|
||||
top_3_retrieved = retrieved[:3]
|
||||
if any(std in expected for std in top_3_retrieved):
|
||||
hits_at_3 += 1
|
||||
|
||||
# 2. Calculate MRR @5 (Mean Reciprocal Rank of first correct standard in top 5)
|
||||
top_5_retrieved = retrieved[:5]
|
||||
mrr = 0.0
|
||||
for rank, std in enumerate(top_5_retrieved, start=1):
|
||||
if std in expected:
|
||||
mrr = 1.0 / rank
|
||||
break # Only care about the first correct standard
|
||||
mrr_sum_at_5 += mrr
|
||||
|
||||
# Calculate Final Metrics
|
||||
hit_rate_3 = (hits_at_3 / total_queries) * 100
|
||||
mrr_5 = mrr_sum_at_5 / total_queries
|
||||
avg_latency = total_latency / total_queries
|
||||
|
||||
print("=" * 40)
|
||||
print(" BIS HACKATHON EVALUATION RESULTS")
|
||||
print("=" * 40)
|
||||
print(f"Total Queries Evaluated : {total_queries}")
|
||||
print(f"Hit Rate @3 : {hit_rate_3:.2f}% \t(Target: >80%)")
|
||||
print(f"MRR @5 : {mrr_5:.4f} \t(Target: >0.7)")
|
||||
print(f"Avg Latency : {avg_latency:.2f} sec \t(Target: <5 seconds)")
|
||||
print("=" * 40)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Evaluate RAG Pipeline Results for BIS Hackathon"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--results",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the participant's output JSON file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
evaluate_results(args.results)
|
||||
Reference in New Issue
Block a user