From e3a9999d73eb91aa2b75dc93f9f74c1fd131f3bd Mon Sep 17 00:00:00 2001
From: Maxime Labonne
Date: Thu, 28 Mar 2024 21:28:05 +0000
Subject: [PATCH] fix get version

---
 llm_autoeval/table.py |  9 ++++-----
 main.py               |  6 ++++++
 runpod.sh             | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/llm_autoeval/table.py b/llm_autoeval/table.py
index becc738..33dd226 100644
--- a/llm_autoeval/table.py
+++ b/llm_autoeval/table.py
@@ -67,7 +67,9 @@ def make_table(result_dict, task):
     values = []
 
     for k, dic in sorted(result_dict["results"].items()):
-        version = result_dict["versions"][k]
+        # Correctly use get() to safely access the dictionary
+        version = result_dict["versions"].get(k, "N/A")  # Use get() on the versions dictionary
+
         percent = k == "squad2"
         for m, v in dic.items():
             if m.endswith("_stderr"):
@@ -93,10 +95,7 @@ def make_table(result_dict, task):
                     # If conversion fails, use the original string value
                     v_formatted = v
 
-            if isinstance(version, str):
-                values.append([k, version, m, v_formatted, "", ""])
-            else:
-                values.append([k, version, m, v_formatted, "", ""])
+            values.append([k, version, m, v_formatted, "", ""])
             k = ""
             version = ""
 
diff --git a/main.py b/main.py
index 6a3956c..4e69978 100644
--- a/main.py
+++ b/main.py
@@ -83,6 +83,10 @@ def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
     summary += final_table
     return summary
 
+def _make_eqbench_summary(directory: str, elapsed_time: float) -> str:
+    result_dict = _get_result_dict(directory)
+    summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
+    return summary
 
 def main(directory: str, elapsed_time: float) -> None:
     # Tasks
@@ -90,6 +94,8 @@ def main(directory: str, elapsed_time: float) -> None:
         summary = _make_autoeval_summary(directory, elapsed_time)
     elif BENCHMARK == "lighteval":
         summary = _make_lighteval_summary(directory, elapsed_time)
+    elif BENCHMARK == "eq-bench":
+        summary = _make_eqbench_summary(directory, elapsed_time)
     else:
         raise NotImplementedError(
             f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
diff --git a/runpod.sh b/runpod.sh
index a53902c..5602ae4 100644
--- a/runpod.sh
+++ b/runpod.sh
@@ -37,7 +37,7 @@ if [ "$BENCHMARK" == "nous" ]; then
     pip install -e .
 
     benchmark="agieval"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -47,7 +47,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="gpt4all"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -57,7 +57,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="truthfulqa"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -67,7 +67,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="bigbench"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -88,7 +88,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
     pip install accelerate
 
     benchmark="arc"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -98,7 +98,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="hellaswag"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -108,7 +108,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="mmlu"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -119,7 +119,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="truthfulqa"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -129,7 +129,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="winogrande"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [5/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -139,7 +139,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="gsm8k"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [6/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -186,6 +186,27 @@ elif [ "$BENCHMARK" == "lighteval" ]; then
     end=$(date +%s)
 
     python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+
+elif [ "$BENCHMARK" == "eq-bench" ]; then
+    git clone https://github.com/EleutherAI/lm-evaluation-harness
+    cd lm-evaluation-harness
+    pip install -e .
+    pip install accelerate
+
+    benchmark="eq-bench"
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
+    accelerate launch -m lm_eval \
+        --model hf \
+        --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
+        --tasks eq_bench \
+        --num_fewshot 0 \
+        --batch_size auto \
+        --output_path ./${benchmark}.json
+
+    end=$(date +%s)
+
+    python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+
 else
     echo "Error: Invalid BENCHMARK value. Please set BENCHMARK to 'nous', 'openllm', or 'lighteval'."
 fi