From e3a9999d73eb91aa2b75dc93f9f74c1fd131f3bd Mon Sep 17 00:00:00 2001
From: Maxime Labonne
Date: Thu, 28 Mar 2024 21:28:05 +0000
Subject: [PATCH] fix get version

---
 llm_autoeval/table.py |  9 ++++-----
 main.py               |  6 ++++++
 runpod.sh             | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/llm_autoeval/table.py b/llm_autoeval/table.py
index becc738..33dd226 100644
--- a/llm_autoeval/table.py
+++ b/llm_autoeval/table.py
@@ -67,7 +67,9 @@ def make_table(result_dict, task):
     values = []
 
     for k, dic in sorted(result_dict["results"].items()):
-        version = result_dict["versions"][k]
+        # Correctly use get() to safely access the dictionary
+        version = result_dict["versions"].get(k, "N/A")  # Use get() on the versions dictionary
+
         percent = k == "squad2"
         for m, v in dic.items():
             if m.endswith("_stderr"):
@@ -93,10 +95,7 @@ def make_table(result_dict, task):
                     # If conversion fails, use the original string value
                     v_formatted = v
 
-            if isinstance(version, str):
-                values.append([k, version, m, v_formatted, "", ""])
-            else:
-                values.append([k, version, m, v_formatted, "", ""])
+            values.append([k, version, m, v_formatted, "", ""])
             k = ""
             version = ""
 
diff --git a/main.py b/main.py
index 6a3956c..4e69978 100644
--- a/main.py
+++ b/main.py
@@ -83,6 +83,10 @@ def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
     summary += final_table
     return summary
 
+def _make_eqbench_summary(directory: str, elapsed_time: float) -> str:
+    result_dict = _get_result_dict(directory)
+    summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
+    return summary
 
 def main(directory: str, elapsed_time: float) -> None:
     # Tasks
@@ -90,6 +94,8 @@ def main(directory: str, elapsed_time: float) -> None:
         summary = _make_autoeval_summary(directory, elapsed_time)
     elif BENCHMARK == "lighteval":
         summary = _make_lighteval_summary(directory, elapsed_time)
+    elif BENCHMARK == "eq-bench":
+        summary = _make_eqbench_summary(directory, elapsed_time)
     else:
         raise NotImplementedError(
             f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
diff --git a/runpod.sh b/runpod.sh
index a53902c..5602ae4 100644
--- a/runpod.sh
+++ b/runpod.sh
@@ -37,7 +37,7 @@ if [ "$BENCHMARK" == "nous" ]; then
     pip install -e .
 
     benchmark="agieval"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -47,7 +47,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="gpt4all"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -57,7 +57,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="truthfulqa"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -67,7 +67,7 @@ if [ "$BENCHMARK" == "nous" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="bigbench"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/4] =================="
     python main.py \
         --model hf-causal \
         --model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -88,7 +88,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
     pip install accelerate
 
     benchmark="arc"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -98,7 +98,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="hellaswag"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -108,7 +108,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="mmlu"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -119,7 +119,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="truthfulqa"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -129,7 +129,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="winogrande"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [5/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -139,7 +139,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
         --output_path ./${benchmark}.json
 
     benchmark="gsm8k"
-    echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [6/6] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -186,6 +186,27 @@ elif [ "$BENCHMARK" == "lighteval" ]; then
     end=$(date +%s)
 
     python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+
+elif [ "$BENCHMARK" == "eq-bench" ]; then
+    git clone https://github.com/EleutherAI/lm-evaluation-harness
+    cd lm-evaluation-harness
+    pip install -e .
+    pip install accelerate
+
+    benchmark="eq-bench"
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
+    accelerate launch -m lm_eval \
+        --model hf \
+        --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
+        --tasks eq_bench \
+        --num_fewshot 0 \
+        --batch_size auto \
+        --output_path ./${benchmark}.json
+
+    end=$(date +%s)
+
+    python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+
 else
     echo "Error: Invalid BENCHMARK value. Please set BENCHMARK to 'nous', 'openllm', or 'lighteval'."
 fi