Merge pull request #26 from mlabonne/eqbench
Eqbench
mlabonne authored Apr 4, 2024
2 parents f396921 + ba55b76 commit 7f00053
Showing 3 changed files with 11 additions and 13 deletions.
4 changes: 4 additions & 0 deletions llm_autoeval/table.py
@@ -52,6 +52,10 @@ def calculate_average(data, task):
         elif task == "truthfulqa":
             value = data["results"]["truthfulqa_mc"]["mc2"]
             return 0.0 if math.isnan(value) else value * 100
+
+    elif BENCHMARK == "eq-bench":
+        if task == "eq-bench":
+            return data["results"]["eq_bench"]["eqbench,none"]
 
     raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}")

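For context, a minimal sketch of how the new branch is exercised once lm-evaluation-harness has written its results JSON. The standalone scaffolding, the hard-coded BENCHMARK value, and the file path are assumptions for illustration; only the dictionary lookup mirrors the committed code.

```python
import json

BENCHMARK = "eq-bench"  # taken from the environment in the real project (assumption here)

def calculate_average(data, task):
    # Condensed to the branch added in this commit.
    if BENCHMARK == "eq-bench":
        if task == "eq-bench":
            # lm-evaluation-harness keys each metric as "<metric>,<filter>",
            # hence "eqbench,none" under the "eq_bench" task entry.
            return data["results"]["eq_bench"]["eqbench,none"]
    raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}")

# Hypothetical results file, matching the output path set in runpod.sh below.
with open("evals/eq-bench.json") as f:
    data = json.load(f)

print(calculate_average(data, "eq-bench"))
```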
14 changes: 4 additions & 10 deletions main.py
@@ -25,9 +25,11 @@ def _make_autoeval_summary(directory: str, elapsed_time: float) -> str:
         tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"]
     elif BENCHMARK == "nous":
         tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"]
+    elif BENCHMARK == "eq-bench":
+        tasks = ["EQ-Bench"]
     else:
         raise NotImplementedError(
-            f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
+            f"The benchmark {BENCHMARK} could not be found."
         )
 
     # Load results
@@ -84,20 +86,12 @@ def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
     return summary
 
 
-def _make_eqbench_summary(directory: str, elapsed_time: float) -> str:
-    result_dict = _get_result_dict(directory)
-    summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
-    return summary
-
-
 def main(directory: str, elapsed_time: float) -> None:
     # Tasks
-    if BENCHMARK == "openllm" or BENCHMARK == "nous":
+    if BENCHMARK == "openllm" or BENCHMARK == "nous" or BENCHMARK == "eq-bench":
         summary = _make_autoeval_summary(directory, elapsed_time)
     elif BENCHMARK == "lighteval":
         summary = _make_lighteval_summary(directory, elapsed_time)
-    elif BENCHMARK == "eq-bench":
-        summary = _make_eqbench_summary(directory, elapsed_time)
     else:
         raise NotImplementedError(
             f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
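To make the net effect of this file easier to see, here is a condensed sketch of the dispatch after the merge: the dedicated `_make_eqbench_summary` helper is gone and eq-bench now goes through the generic autoeval summary. The stubbed bodies, the hard-coded BENCHMARK, and the `in (...)` shorthand (the diff spells it out with chained `or`) are illustrative only.

```python
BENCHMARK = "eq-bench"  # set via the environment in the real script

def _make_autoeval_summary(directory: str, elapsed_time: float) -> str:
    if BENCHMARK == "openllm":
        tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"]
    elif BENCHMARK == "nous":
        tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"]
    elif BENCHMARK == "eq-bench":
        tasks = ["EQ-Bench"]
    else:
        raise NotImplementedError(f"The benchmark {BENCHMARK} could not be found.")
    # The real function loads the results and builds a per-task table;
    # stubbed here to keep the sketch self-contained.
    return f"summary over {tasks}"

def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
    return "lighteval summary (unchanged by this commit, stubbed here)"

def main(directory: str, elapsed_time: float) -> None:
    if BENCHMARK in ("openllm", "nous", "eq-bench"):
        summary = _make_autoeval_summary(directory, elapsed_time)
    elif BENCHMARK == "lighteval":
        summary = _make_lighteval_summary(directory, elapsed_time)
    else:
        raise NotImplementedError(
            f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
        )
    print(summary)  # the real script goes on to publish the summary; printing is a stand-in

main("./evals", 0.0)
```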
6 changes: 3 additions & 3 deletions runpod.sh
@@ -194,18 +194,18 @@ elif [ "$BENCHMARK" == "eq-bench" ]; then
     pip install accelerate
 
     benchmark="eq-bench"
-    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/1] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
         --tasks eq_bench \
         --num_fewshot 0 \
         --batch_size auto \
-        --output_path ./${benchmark}.json
+        --output_path ./evals/${benchmark}.json
 
     end=$(date +%s)
 
-    python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+    python ../llm-autoeval/main.py ./evals $(($end-$start))
 
 else
     echo "Error: Invalid BENCHMARK value. Please set BENCHMARK to 'nous', 'openllm', or 'lighteval'."
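The two path changes in this script work together: lm_eval now writes its JSON under ./evals/, and main.py is pointed at ./evals instead of ./evals/results, so the summary step can actually find the output. A small sanity-check sketch under those assumptions (the check itself is illustrative, not part of the commit):

```python
from pathlib import Path

# Assumption: after this change the eq-bench run leaves its results somewhere
# under ./evals/ (e.g. ./evals/eq-bench.json), and main.py is handed ./evals.
evals_dir = Path("./evals")
results = sorted(evals_dir.rglob("*.json"))

if not results:
    raise SystemExit(
        "No result files under ./evals; before this fix the harness wrote to ./ "
        "while main.py searched ./evals/results."
    )
print(f"Found {len(results)} result file(s): {[p.name for p in results]}")
```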
